diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 805b79491e6ea..6e1301cc9de6f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4352,7 +4352,7 @@ defm ptrauth_init_fini_address_discrimination : OptInCC1FFlag<"ptrauth-init-fini def fenable_matrix : Flag<["-"], "fenable-matrix">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Enable matrix data type and related builtin functions">, - MarshallingInfoFlag>; + MarshallingInfoFlag, hlsl.KeyPath>; defm raw_string_literals : BoolFOption<"raw-string-literals", LangOpts<"RawStringLiterals">, Default, diff --git a/clang/include/clang/Sema/HLSLExternalSemaSource.h b/clang/include/clang/Sema/HLSLExternalSemaSource.h index 3c7495e66055d..6f4b72045a946 100644 --- a/clang/include/clang/Sema/HLSLExternalSemaSource.h +++ b/clang/include/clang/Sema/HLSLExternalSemaSource.h @@ -28,6 +28,7 @@ class HLSLExternalSemaSource : public ExternalSemaSource { llvm::DenseMap Completions; void defineHLSLVectorAlias(); + void defineHLSLMatrixAlias(); void defineTrivialHLSLTypes(); void defineHLSLTypesWithForwardDeclarations(); diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 6d8db5cf4ffd2..09e376156df7f 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -852,10 +852,18 @@ void TypePrinter::printExtVectorAfter(const ExtVectorType *T, raw_ostream &OS) { void TypePrinter::printConstantMatrixBefore(const ConstantMatrixType *T, raw_ostream &OS) { + if (Policy.UseHLSLTypes) + OS << "matrix<"; printBefore(T->getElementType(), OS); - OS << " __attribute__((matrix_type("; + if (!Policy.UseHLSLTypes) + OS << " __attribute__((matrix_type("; + else + OS << ", "; OS << T->getNumRows() << ", " << T->getNumColumns(); - OS << ")))"; + if (!Policy.UseHLSLTypes) + OS << ")))"; + else + OS << ">"; } void TypePrinter::printConstantMatrixAfter(const ConstantMatrixType *T, @@ -865,16 +873,25 @@ void 
TypePrinter::printConstantMatrixAfter(const ConstantMatrixType *T, void TypePrinter::printDependentSizedMatrixBefore( const DependentSizedMatrixType *T, raw_ostream &OS) { + if (Policy.UseHLSLTypes) + OS << "matrix<"; printBefore(T->getElementType(), OS); - OS << " __attribute__((matrix_type("; - if (T->getRowExpr()) { - T->getRowExpr()->printPretty(OS, nullptr, Policy); - } + if (!Policy.UseHLSLTypes) + OS << " __attribute__((matrix_type("; + else + OS << ", "; + + if (Expr *E = T->getRowExpr()) + E->printPretty(OS, nullptr, Policy); OS << ", "; - if (T->getColumnExpr()) { - T->getColumnExpr()->printPretty(OS, nullptr, Policy); - } - OS << ")))"; + if (Expr *E = T->getColumnExpr()) + E->printPretty(OS, nullptr, Policy); + + OS << ", "; + if (!Policy.UseHLSLTypes) + OS << ")))"; + else + OS << ">"; } void TypePrinter::printDependentSizedMatrixAfter( diff --git a/clang/lib/Headers/hlsl/hlsl_basic_types.h b/clang/lib/Headers/hlsl/hlsl_basic_types.h index eff94e0d7f950..b6eeffa2f5e36 100644 --- a/clang/lib/Headers/hlsl/hlsl_basic_types.h +++ b/clang/lib/Headers/hlsl/hlsl_basic_types.h @@ -115,6 +115,238 @@ typedef vector float64_t2; typedef vector float64_t3; typedef vector float64_t4; +#ifdef __HLSL_ENABLE_16_BIT +typedef matrix int16_t1x1; +typedef matrix int16_t1x2; +typedef matrix int16_t1x3; +typedef matrix int16_t1x4; +typedef matrix int16_t2x1; +typedef matrix int16_t2x2; +typedef matrix int16_t2x3; +typedef matrix int16_t2x4; +typedef matrix int16_t3x1; +typedef matrix int16_t3x2; +typedef matrix int16_t3x3; +typedef matrix int16_t3x4; +typedef matrix int16_t4x1; +typedef matrix int16_t4x2; +typedef matrix int16_t4x3; +typedef matrix int16_t4x4; +typedef matrix uint16_t1x1; +typedef matrix uint16_t1x2; +typedef matrix uint16_t1x3; +typedef matrix uint16_t1x4; +typedef matrix uint16_t2x1; +typedef matrix uint16_t2x2; +typedef matrix uint16_t2x3; +typedef matrix uint16_t2x4; +typedef matrix uint16_t3x1; +typedef matrix uint16_t3x2; +typedef matrix uint16_t3x3; 
+typedef matrix uint16_t3x4; +typedef matrix uint16_t4x1; +typedef matrix uint16_t4x2; +typedef matrix uint16_t4x3; +typedef matrix uint16_t4x4; +#endif +typedef matrix int1x1; +typedef matrix int1x2; +typedef matrix int1x3; +typedef matrix int1x4; +typedef matrix int2x1; +typedef matrix int2x2; +typedef matrix int2x3; +typedef matrix int2x4; +typedef matrix int3x1; +typedef matrix int3x2; +typedef matrix int3x3; +typedef matrix int3x4; +typedef matrix int4x1; +typedef matrix int4x2; +typedef matrix int4x3; +typedef matrix int4x4; +typedef matrix uint1x1; +typedef matrix uint1x2; +typedef matrix uint1x3; +typedef matrix uint1x4; +typedef matrix uint2x1; +typedef matrix uint2x2; +typedef matrix uint2x3; +typedef matrix uint2x4; +typedef matrix uint3x1; +typedef matrix uint3x2; +typedef matrix uint3x3; +typedef matrix uint3x4; +typedef matrix uint4x1; +typedef matrix uint4x2; +typedef matrix uint4x3; +typedef matrix uint4x4; +typedef matrix int32_t1x1; +typedef matrix int32_t1x2; +typedef matrix int32_t1x3; +typedef matrix int32_t1x4; +typedef matrix int32_t2x1; +typedef matrix int32_t2x2; +typedef matrix int32_t2x3; +typedef matrix int32_t2x4; +typedef matrix int32_t3x1; +typedef matrix int32_t3x2; +typedef matrix int32_t3x3; +typedef matrix int32_t3x4; +typedef matrix int32_t4x1; +typedef matrix int32_t4x2; +typedef matrix int32_t4x3; +typedef matrix int32_t4x4; +typedef matrix uint32_t1x1; +typedef matrix uint32_t1x2; +typedef matrix uint32_t1x3; +typedef matrix uint32_t1x4; +typedef matrix uint32_t2x1; +typedef matrix uint32_t2x2; +typedef matrix uint32_t2x3; +typedef matrix uint32_t2x4; +typedef matrix uint32_t3x1; +typedef matrix uint32_t3x2; +typedef matrix uint32_t3x3; +typedef matrix uint32_t3x4; +typedef matrix uint32_t4x1; +typedef matrix uint32_t4x2; +typedef matrix uint32_t4x3; +typedef matrix uint32_t4x4; +typedef matrix int64_t1x1; +typedef matrix int64_t1x2; +typedef matrix int64_t1x3; +typedef matrix int64_t1x4; +typedef matrix int64_t2x1; +typedef 
matrix int64_t2x2; +typedef matrix int64_t2x3; +typedef matrix int64_t2x4; +typedef matrix int64_t3x1; +typedef matrix int64_t3x2; +typedef matrix int64_t3x3; +typedef matrix int64_t3x4; +typedef matrix int64_t4x1; +typedef matrix int64_t4x2; +typedef matrix int64_t4x3; +typedef matrix int64_t4x4; +typedef matrix uint64_t1x1; +typedef matrix uint64_t1x2; +typedef matrix uint64_t1x3; +typedef matrix uint64_t1x4; +typedef matrix uint64_t2x1; +typedef matrix uint64_t2x2; +typedef matrix uint64_t2x3; +typedef matrix uint64_t2x4; +typedef matrix uint64_t3x1; +typedef matrix uint64_t3x2; +typedef matrix uint64_t3x3; +typedef matrix uint64_t3x4; +typedef matrix uint64_t4x1; +typedef matrix uint64_t4x2; +typedef matrix uint64_t4x3; +typedef matrix uint64_t4x4; + +typedef matrix half1x1; +typedef matrix half1x2; +typedef matrix half1x3; +typedef matrix half1x4; +typedef matrix half2x1; +typedef matrix half2x2; +typedef matrix half2x3; +typedef matrix half2x4; +typedef matrix half3x1; +typedef matrix half3x2; +typedef matrix half3x3; +typedef matrix half3x4; +typedef matrix half4x1; +typedef matrix half4x2; +typedef matrix half4x3; +typedef matrix half4x4; +typedef matrix float1x1; +typedef matrix float1x2; +typedef matrix float1x3; +typedef matrix float1x4; +typedef matrix float2x1; +typedef matrix float2x2; +typedef matrix float2x3; +typedef matrix float2x4; +typedef matrix float3x1; +typedef matrix float3x2; +typedef matrix float3x3; +typedef matrix float3x4; +typedef matrix float4x1; +typedef matrix float4x2; +typedef matrix float4x3; +typedef matrix float4x4; +typedef matrix double1x1; +typedef matrix double1x2; +typedef matrix double1x3; +typedef matrix double1x4; +typedef matrix double2x1; +typedef matrix double2x2; +typedef matrix double2x3; +typedef matrix double2x4; +typedef matrix double3x1; +typedef matrix double3x2; +typedef matrix double3x3; +typedef matrix double3x4; +typedef matrix double4x1; +typedef matrix double4x2; +typedef matrix double4x3; +typedef 
matrix double4x4; + +#ifdef __HLSL_ENABLE_16_BIT +typedef matrix float16_t1x1; +typedef matrix float16_t1x2; +typedef matrix float16_t1x3; +typedef matrix float16_t1x4; +typedef matrix float16_t2x1; +typedef matrix float16_t2x2; +typedef matrix float16_t2x3; +typedef matrix float16_t2x4; +typedef matrix float16_t3x1; +typedef matrix float16_t3x2; +typedef matrix float16_t3x3; +typedef matrix float16_t3x4; +typedef matrix float16_t4x1; +typedef matrix float16_t4x2; +typedef matrix float16_t4x3; +typedef matrix float16_t4x4; +#endif + +typedef matrix float32_t1x1; +typedef matrix float32_t1x2; +typedef matrix float32_t1x3; +typedef matrix float32_t1x4; +typedef matrix float32_t2x1; +typedef matrix float32_t2x2; +typedef matrix float32_t2x3; +typedef matrix float32_t2x4; +typedef matrix float32_t3x1; +typedef matrix float32_t3x2; +typedef matrix float32_t3x3; +typedef matrix float32_t3x4; +typedef matrix float32_t4x1; +typedef matrix float32_t4x2; +typedef matrix float32_t4x3; +typedef matrix float32_t4x4; +typedef matrix float64_t1x1; +typedef matrix float64_t1x2; +typedef matrix float64_t1x3; +typedef matrix float64_t1x4; +typedef matrix float64_t2x1; +typedef matrix float64_t2x2; +typedef matrix float64_t2x3; +typedef matrix float64_t2x4; +typedef matrix float64_t3x1; +typedef matrix float64_t3x2; +typedef matrix float64_t3x3; +typedef matrix float64_t3x4; +typedef matrix float64_t4x1; +typedef matrix float64_t4x2; +typedef matrix float64_t4x3; +typedef matrix float64_t4x4; + } // namespace hlsl #endif //_HLSL_HLSL_BASIC_TYPES_H_ diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 6eb24c578f602..bd621661bb4bf 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -459,8 +459,81 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() { HLSLNamespace->addDecl(Template); } +void HLSLExternalSemaSource::defineHLSLMatrixAlias() { + ASTContext &AST = 
SemaPtr->getASTContext(); + + llvm::SmallVector TemplateParams; + + auto *TypeParam = TemplateTypeParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 0, + &AST.Idents.get("element", tok::TokenKind::identifier), false, false); + TypeParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc( + TemplateArgument(AST.FloatTy), QualType(), SourceLocation())); + + TemplateParams.emplace_back(TypeParam); + + // these should be 64 bit to be consistent with other clang matrices. + auto *RowsParam = NonTypeTemplateParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 1, + &AST.Idents.get("rows_count", tok::TokenKind::identifier), AST.IntTy, + false, AST.getTrivialTypeSourceInfo(AST.IntTy)); + llvm::APInt RVal(AST.getIntWidth(AST.IntTy), 4); + TemplateArgument RDefault(AST, llvm::APSInt(std::move(RVal)), AST.IntTy, + /*IsDefaulted=*/true); + RowsParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc(RDefault, AST.IntTy, + SourceLocation(), RowsParam)); + TemplateParams.emplace_back(RowsParam); + + auto *ColsParam = NonTypeTemplateParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 2, + &AST.Idents.get("cols_count", tok::TokenKind::identifier), AST.IntTy, + false, AST.getTrivialTypeSourceInfo(AST.IntTy)); + llvm::APInt CVal(AST.getIntWidth(AST.IntTy), 4); + TemplateArgument CDefault(AST, llvm::APSInt(std::move(CVal)), AST.IntTy, + /*IsDefaulted=*/true); + ColsParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc(CDefault, AST.IntTy, + SourceLocation(), ColsParam)); + TemplateParams.emplace_back(RowsParam); + + auto *ParamList = + TemplateParameterList::Create(AST, SourceLocation(), SourceLocation(), + TemplateParams, SourceLocation(), nullptr); + + IdentifierInfo &II = AST.Idents.get("matrix", tok::TokenKind::identifier); + + QualType AliasType = AST.getDependentSizedMatrixType( + AST.getTemplateTypeParmType(0, 0, false, TypeParam), + 
DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), RowsParam, false, + DeclarationNameInfo(RowsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue), + DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), ColsParam, false, + DeclarationNameInfo(ColsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue), + SourceLocation()); + + auto *Record = TypeAliasDecl::Create(AST, HLSLNamespace, SourceLocation(), + SourceLocation(), &II, + AST.getTrivialTypeSourceInfo(AliasType)); + Record->setImplicit(true); + + auto *Template = + TypeAliasTemplateDecl::Create(AST, HLSLNamespace, SourceLocation(), + Record->getIdentifier(), ParamList, Record); + + Record->setDescribedAliasTemplate(Template); + Template->setImplicit(true); + Template->setLexicalDeclContext(Record->getDeclContext()); + HLSLNamespace->addDecl(Template); +} + void HLSLExternalSemaSource::defineTrivialHLSLTypes() { defineHLSLVectorAlias(); + defineHLSLMatrixAlias(); } /// Set up common members and attributes for buffer types diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index e526a11973975..1471b402c1158 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2447,7 +2447,7 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, SourceLocation AttrLoc) { - assert(Context.getLangOpts().MatrixTypes && + assert(getLangOpts().MatrixTypes && "Should never build a matrix type when it is disabled"); // Check element type, if it is not dependent. 
diff --git a/clang/test/AST/HLSL/matrix-alias.hlsl b/clang/test/AST/HLSL/matrix-alias.hlsl new file mode 100644 index 0000000000000..307b317998f85 --- /dev/null +++ b/clang/test/AST/HLSL/matrix-alias.hlsl @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -o - %s | FileCheck %s + +// Test that matrix aliases are set up properly for HLSL + +// CHECK: NamespaceDecl 0x{{[0-9a-fA-F]+}} <> implicit hlsl +// CHECK-NEXT: TypeAliasTemplateDecl 0x{{[0-9a-fA-F]+}} <> implicit vector +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9a-fA-F]+}} <> class depth 0 index 0 element +// CHECK-NEXT: TemplateArgument type 'float' +// CHECK-NEXT: BuiltinType 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: NonTypeTemplateParmDecl 0x{{[0-9a-fA-F]+}} <> 'int' depth 0 index 1 element_count +// CHECK-NEXT: TemplateArgument expr +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> 'int' 4 +// CHECK-NEXT: TypeAliasDecl 0x{{[0-9a-fA-F]+}} <> implicit vector 'vector' +// CHECK-NEXT: DependentSizedExtVectorType 0x{{[0-9a-fA-F]+}} 'vector' dependent +// CHECK-NEXT: TemplateTypeParmType 0x{{[0-9a-fA-F]+}} 'element' dependent depth 0 index 0 +// CHECK-NEXT: TemplateTypeParm 0x{{[0-9a-fA-F]+}} 'element' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <> 'int' lvalue +// CHECK-SAME: NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int' + +// Make sure we got a using directive at the end. +// CHECK: UsingDirectiveDecl 0x{{[0-9a-fA-F]+}} <> Namespace 0x{{[0-9a-fA-F]+}} 'hlsl' + +[numthreads(1,1,1)] +int entry() { + // Verify that the alias is generated inside the hlsl namespace. + hlsl::matrix Mat2x2; + + // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} + // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} col:29 Mat2x2 'hlsl::matrix' + + // Verify that you don't need to specify the namespace. + matrix Vec2x2a; + + // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} + // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} col:21 Vec2x2a 'matrix' + + // Build a bigger matrix. 
+ matrix Mat4x4; + + // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} + // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} col:24 Mat4x4 'matrix' + + // Verify that the implicit arguments generate the correct type. + matrix<> ImpMat4x4; + + // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} + // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} col:12 ImpMat4x4 'matrix<>':'matrix' + return 1; +} diff --git a/clang/test/AST/HLSL/vector-alias.hlsl b/clang/test/AST/HLSL/vector-alias.hlsl index 3d112ee1b2230..e7c72d51a6338 100644 --- a/clang/test/AST/HLSL/vector-alias.hlsl +++ b/clang/test/AST/HLSL/vector-alias.hlsl @@ -13,7 +13,7 @@ // CHECK-NEXT: TemplateTypeParmType 0x{{[0-9a-fA-F]+}} 'element' dependent depth 0 index 0 // CHECK-NEXT: TemplateTypeParm 0x{{[0-9a-fA-F]+}} 'element' // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <> 'int' lvalue -// NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int' +// CHECK-SAME: NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int' // Make sure we got a using directive at the end. // CHECK: UsingDirectiveDecl 0x{{[0-9a-fA-F]+}} <> Namespace 0x{{[0-9a-fA-F]+}} 'hlsl' diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl new file mode 100644 index 0000000000000..a8b56e86cfa50 --- /dev/null +++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl @@ -0,0 +1,349 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s + + +template +using matrix_3_3 = matrix; + +template +using matrix_4_4 = matrix; + +// CHECK-LABEL: define {{.*}}CastCharMatrixToIntCStyle +void CastCharMatrixToIntCStyle() { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x 
i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + + matrix_4_4 c; + matrix_4_4 i; + i = (matrix_4_4)c; +} + +// CHECK-LABEL: define {{.*}}CastCharMatrixToIntStaticCast +void CastCharMatrixToIntStaticCast() { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + + matrix_4_4 c; + matrix_4_4 i; + i = static_cast>(c); +} + +// CHECK-LABEL: define {{.*}}CastCharMatrixToUnsignedIntCStyle +void CastCharMatrixToUnsignedIntCStyle() { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 c; + matrix_4_4 u; + u = (matrix_4_4)c; +} + +// CHECK-LABEL: define {{.*}}CastCharMatrixToUnsignedIntStaticCast +void CastCharMatrixToUnsignedIntStaticCast() { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 c; + matrix_4_4 u; + u = static_cast>(c); +} + +// CHECK-LABEL: define {{.*}}CastUnsignedLongIntMatrixToShortCStyle +void CastUnsignedLongIntMatrixToShortCStyle() { + // CHECK: [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> {{.*}} to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 s; + s = (matrix_4_4)u; +} + +// CHECK-LABEL: define {{.*}}CastUnsignedLongIntMatrixToShortStaticCast +void CastUnsignedLongIntMatrixToShortStaticCast() { + // CHECK: [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> {{.*}} to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, 
align 2 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 s; + s = static_cast>(u); +} + +// CHECK-LABEL: define {{.*}}CastIntMatrixToShortCStyle +void CastIntMatrixToShortCStyle() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 s; + s = (matrix_4_4)i; +} + +// CHECK-LABEL: define {{.*}}CastIntMatrixToShortStaticCast +void CastIntMatrixToShortStaticCast() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 s; + s = static_cast>(i); +} + +// CHECK-LABEL: define {{.*}}CastIntMatrixToFloatCStyle +void CastIntMatrixToFloatCStyle() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV]] = sitofp <16 x i32> {{.*}} to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 f; + f = (matrix_4_4)i; +} + +// CHECK-LABEL: define {{.*}}CastIntMatrixToFloatStaticCast +void CastIntMatrixToFloatStaticCast() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV]] = sitofp <16 x i32> {{.*}} to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 f; + f = static_cast>(i); +} + +// CHECK-LABEL: define {{.*}}CastUnsignedIntMatrixToFloatCStyle +void CastUnsignedIntMatrixToFloatCStyle() { + // CHECK: [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 f; + f = 
(matrix_4_4)u; +} + +// CHECK-LABEL: define {{.*}}CastUnsignedIntMatrixToFloatStaticCast +void CastUnsignedIntMatrixToFloatStaticCast() { + // CHECK: [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 f; + f = static_cast>(u); +} + +// CHECK-LABEL: define {{.*}}CastDoubleMatrixToIntCStyle +void CastDoubleMatrixToIntCStyle() { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 d; + matrix_4_4 i; + i = (matrix_4_4)d; +} + +// CHECK-LABEL: define {{.*}}CastDoubleMatrixToIntStaticCast +void CastDoubleMatrixToIntStaticCast() { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 d; + matrix_4_4 i; + i = static_cast>(d); +} + +// CHECK-LABEL: define {{.*}}CastFloatMatrixToUnsignedShortIntCStyle +void CastFloatMatrixToUnsignedShortIntCStyle() { + // CHECK: [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 f; + matrix_4_4 i; + i = (matrix_4_4)f; +} + +// CHECK-LABEL: define {{.*}}CastFloatMatrixToUnsignedShortIntStaticCast +void CastFloatMatrixToUnsignedShortIntStaticCast() { + // CHECK: [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 f; + matrix_4_4 i; + i = 
static_cast>(f); +} + +// CHECK-LABEL: define {{.*}}CastDoubleMatrixToFloatCStyle +void CastDoubleMatrixToFloatCStyle() { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 d; + matrix_4_4 f; + f = (matrix_4_4)d; +} + +// CHECK-LABEL: define {{.*}}CastDoubleMatrixToFloatStaticCast +void CastDoubleMatrixToFloatStaticCast() { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 d; + matrix_4_4 f; + f = static_cast>(d); +} + +// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToUnsignedIntCStyle +void CastUnsignedShortIntToUnsignedIntCStyle() { + // CHECK: [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 s; + matrix_4_4 i; + i = (matrix_4_4)s; +} + +// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToUnsignedIntStaticCast +void CastUnsignedShortIntToUnsignedIntStaticCast() { + // CHECK: [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 s; + matrix_4_4 i; + i = static_cast>(s); +} + +// CHECK-LABEL: define {{.*}}CastUnsignedLongIntToUnsignedShortIntCStyle +void CastUnsignedLongIntToUnsignedShortIntCStyle() { + // CHECK: [[L:%.*]] = load <16 x i64>, ptr %l, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 l; + matrix_4_4 
s; + s = (matrix_4_4)l; +} + +// CHECK-LABEL: define {{.*}}CastUnsignedLongIntToUnsignedShortIntStaticCast +void CastUnsignedLongIntToUnsignedShortIntStaticCast() { + // CHECK: [[L:%.*]] = load <16 x i64>, ptr %l, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + matrix_4_4 l; + matrix_4_4 s; + s = static_cast>(l); +} + +// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToIntCStyle +void CastUnsignedShortIntToIntCStyle() { + // CHECK: [[U:%.*]] = load <16 x i16>, ptr %u, align 2 + // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 i; + i = (matrix_4_4)u; +} + +// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToIntStaticCast +void CastUnsignedShortIntToIntStaticCast() { + // CHECK: [[U:%.*]] = load <16 x i16>, ptr %u, align 2 + // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + matrix_4_4 u; + matrix_4_4 i; + i = static_cast>(u); +} + +// CHECK-LABEL: define {{.*}}CastIntToUnsignedLongIntCStyle +void CastIntToUnsignedLongIntCStyle() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr %i, align 4 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64> + // CHECK-NEXT: store <16 x i64> [[CONV]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 u; + u = (matrix_4_4)i; +} + +// CHECK-LABEL: define {{.*}}CastIntToUnsignedLongIntStaticCast +void CastIntToUnsignedLongIntStaticCast() { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr %i, align 4 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64> + // CHECK-NEXT: store <16 x i64> [[CONV]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + + matrix_4_4 i; + matrix_4_4 u; + u = static_cast>(i); +} + +class Foo { + int 
x[10]; + + Foo(matrix_4_4 x); +}; + +// These require mangling. DXIL uses MicrosoftMangle which doesn't support mangling matrices yet. +// CHECK-LABEL: define {{.*}}class_constructor_matrix_ty +Foo class_constructor_matrix_ty(matrix_4_4 m) { + // CHECK: [[M:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: call{{.*}} void @_ZN3FooC1Eu11matrix_typeIL{{[mj]}}4EL{{[mj]}}4EiE(ptr noundef nonnull align 4 dereferenceable(40) %agg.result, <16 x i32> noundef [[M]]) + // CHECK-NEXT: ret void + + return Foo(m); +} + +struct Bar { + float x[10]; + Bar(matrix_3_3 x); +}; + +// CHECK-LABEL: define {{.*}}struct_constructor_matrix_ty +Bar struct_constructor_matrix_ty(matrix_3_3 m) { + // CHECK: [[M:%.*]] = load <9 x float>, ptr {{.*}}, align 4 + // CHECK-NEXT: call{{.*}} void @_ZN3BarC1Eu11matrix_typeIL{{[mj]}}3EL{{[mj]}}3EfE(ptr noundef nonnull align 4 dereferenceable(40) %agg.result, <9 x float> noundef [[M]]) + // CHECK-NEXT: ret void + + return Bar(m); +} diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl new file mode 100644 index 0000000000000..a902b6892e2ba --- /dev/null +++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl @@ -0,0 +1,135 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s + +// Test explicit matrix casts. +// This is adapted to HLSL from CodeGen/matrix-cast.c. 
+ +// CHECK-LABEL: define {{.*}}cast_int16_matrix_to_int +void cast_int16_matrix_to_int( int16_t4x4 c, int4x4 i) { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + i = (int4x4)c; +} + +// CHECK-LABEL: define {{.*}}cast_int16_matrix_to_uint +void cast_int16_matrix_to_uint( int16_t4x4 c, uint4x4 u) { + // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + u = (uint4x4)c; +} + +// CHECK-LABEL: define {{.*}}cast_uint64_matrix_to_int16 +void cast_uint64_matrix_to_int16( uint64_t4x4 u, int16_t4x4 s) { + // CHECK: [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[U]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + s = (int16_t4x4)u; +} + +// CHECK-LABEL: define {{.*}}cast_int_matrix_to_int16 +void cast_int_matrix_to_int16( int4x4 i, int16_t4x4 s) { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + s = (int16_t4x4)i; +} + +// CHECK-LABEL: define {{.*}}cast_int_matrix_to_float +void cast_int_matrix_to_float( int4x4 i, float4x4 f) { + // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = sitofp <16 x i32> [[I]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + f = (float4x4)i; +} + +// CHECK-LABEL: define {{.*}}cast_uint_matrix_to_float +void cast_uint_matrix_to_float( uint16_t4x4 u, float4x4 f) { + // CHECK: [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // 
CHECK-NEXT: [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + f = (float4x4)u; +} + +// CHECK-LABEL: define {{.*}}cast_double_matrix_to_int +void cast_double_matrix_to_int( double4x4 d, int4x4 i) { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + i = (int4x4)d; +} + +// CHECK-LABEL: define {{.*}}cast_float_matrix_to_uint16 +void cast_float_matrix_to_uint16( float4x4 f, uint16_t4x4 i) { + // CHECK: [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + i = (uint16_t4x4)f; +} + +// CHECK-LABEL: define {{.*}}cast_double_matrix_to_float +void cast_double_matrix_to_float( double4x4 d, float4x4 f) { + // CHECK: [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float> + // CHECK-NEXT: store <16 x float> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + f = (float4x4)d; +} + +// CHECK-LABEL: define {{.*}}cast_uint16_to_uint +void cast_uint16_to_uint( uint16_t4x4 s, uint4x4 i) { + // CHECK: [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2 + // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32> + // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + i = (uint4x4)s; +} + +// CHECK-LABEL: define {{.*}}cast_uint64_to_uint16 +void cast_uint64_to_uint16( uint64_t4x4 l, uint16_t4x4 s) { + // CHECK: [[L:%.*]] = load <16 x i64>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16> + // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2 + // CHECK-NEXT: ret void + + s = 
(uint16_t4x4)l;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint16_to_int
+void cast_uint16_to_int( uint16_t4x4 u, int4x4 i) {
+  // CHECK: [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT: [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32>
+  // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT: ret void
+
+  i = (int4x4)u;
+}
+
+// CHECK-LABEL: define {{.*}}cast_int_to_uint64
+void cast_int_to_uint64( int4x4 i, uint64_t4x4 u) {
+  // CHECK: [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64>
+  // CHECK-NEXT: store <16 x i64> [[CONV]], ptr {{.*}}, align 8
+  // CHECK-NEXT: ret void
+
+  u = (uint64_t4x4)i;
+}
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl
new file mode 100644
index 0000000000000..dd77fecbbe0d5
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl
@@ -0,0 +1,80 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple dxil-pc-shadermodel6.3-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Test the matrix type transpose builtin.
+
+template <typename EltTy, unsigned Rows, unsigned Cols>
+using matrix_t = matrix<EltTy, Rows, Cols>;
+
+template <typename EltTy, unsigned Rows, unsigned Cols>
+struct MyMatrix {
+  matrix_t<EltTy, Rows, Cols> value;
+};
+
+// Can't test utility function with matrix param without mangling.
+template +MyMatrix transpose(const MyMatrix M) { + MyMatrix Res; + Res.value = __builtin_matrix_transpose(M.value); + return Res; +} + +// CHECK-LABEL: define{{.*}} void @_Z24test_transpose_template1v() +void test_transpose_template1() { + // CHECK: call{{.*}} void @_Z9transposeIiLj3ELj4EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.0) align 4 %M1_t, ptr byval(%struct.MyMatrix) align 4 %agg.tmp) + // CHECK-LABEL: define{{.*}} void @_Z9transposeIiLj3ELj4EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE( + // CHECK: [[M:%.*]] = load <12 x i32>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[M_T:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[M]], i32 3, i32 4) + + MyMatrix M1; + MyMatrix M1_t = transpose(M1); +} + +// CHECK-LABEL: define{{.*}} void @_Z24test_transpose_template2 +void test_transpose_template2(inout MyMatrix M) { + // CHECK: call{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.1) align 8 %agg.tmp1, ptr byval(%struct.MyMatrix.2) align 8 %agg.tmp2) + // CHECK-NEXT: call{{.*}} void @_Z9transposeIdLj2ELj3EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.2) align 8 %agg.tmp, ptr byval(%struct.MyMatrix.1) align 8 %agg.tmp1) + // CHECK-NEXT: call{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.1) align 8 %M2_t, ptr byval(%struct.MyMatrix.2) align 8 %agg.tmp) + + // CHECK-LABEL: define{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE( + // CHECK: [[M:%.*]] = load <6 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[M_T:%.*]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[M]], i32 3, i32 2) + // CHECK-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw %struct.MyMatrix.1, ptr %agg.result, i32 0, i32 0 + // CHECK-NEXT: store <6 x double> [[M_T]], ptr [[RES_ADDR]], 
align 8 + + // CHECK-LABEL: define{{.*}} void @_Z9transposeIdLj2ELj3EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE( + // CHECK: [[M:%.*]] = load <6 x double>, ptr {{.*}}, align 8 + // CHECK-NEXT: [[M_T:%.*]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[M]], i32 2, i32 3) + // CHECK-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw %struct.MyMatrix.2, ptr %agg.result, i32 0, i32 0 + // CHECK-NEXT: store <6 x double> [[M_T]], ptr [[RES_ADDR]], align 8 + + MyMatrix M2_t = transpose(transpose(transpose(M))); +} + +matrix_t get_matrix(); + +// CHECK-LABEL: define{{.*}} void @_Z21test_transpose_rvaluev() +void test_transpose_rvalue() { + // CHECK: [[M_T_ADDR:%.*]] = alloca [9 x float], align 4 + // CHECK-NEXT: [[CALL_RES:%.*]] = call{{.*}} <9 x float> @_Z10get_matrixv() + // CHECK-NEXT: [[ADD:%.*]] = fadd <9 x float> [[CALL_RES]], splat (float 2.000000e+00) + // CHECK-NEXT: [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[ADD]], i32 3, i32 3) + // CHECK-NEXT: store <9 x float> [[M_T]], ptr [[M_T_ADDR]], align 4 + matrix_t m_t = __builtin_matrix_transpose(get_matrix() + 2.0); +} + +// CHECK-LABEL: define{{.*}} void @_Z20test_transpose_const +void test_transpose_const(const matrix_t m) { + // CHECK: [[MATRIX:%.*]] = load <9 x float>, ptr {{.*}}, align 4 + // CHECK-NEXT: [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[MATRIX]], i32 3, i32 3) + // CHECK-NEXT: store <9 x float> [[M_T]], ptr %m_t, align 4 + matrix_t m_t = __builtin_matrix_transpose(m); +} + +// TODO: Enable once initialization support is defined and implemented for +// matrix types. 
+// void test_lvalue_conversion() {
+//   constexpr double4x4 m = {};
+//   [] { return __builtin_matrix_transpose(m); }
+//}
+
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
new file mode 100644
index 0000000000000..1431a5daf8a01
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple dxil-pc-shadermodel6.3-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests the matrix type transpose builtin.
+
+// CHECK-LABEL: define {{.*}}transpose_double_4x4
+void transpose_double_4x4(double4x4 a) {
+  // CHECK: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT: [[TRANS:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[A]], i32 4, i32 4)
+  // CHECK-NEXT: store <16 x double> [[TRANS]], ptr %a_t, align 8
+
+  double4x4 a_t = __builtin_matrix_transpose(a);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_float_3x2
+void transpose_float_3x2(float3x2 a) {
+  // CHECK: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT: [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
+  // CHECK-NEXT: store <6 x float> [[TRANS]], ptr %a_t, align 4
+
+  float2x3 a_t = __builtin_matrix_transpose(a);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_int_4x3
+void transpose_int_4x3(int4x3 a) {
+  // CHECK: [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
+  // CHECK-NEXT: store <12 x i32> [[TRANS]], ptr %a_t, align 4
+
+  int3x4 a_t = __builtin_matrix_transpose(a);
+}
+
+struct Foo {
+  uint1x4 In;
+  uint4x1 Out;
+};
+
+// CHECK-LABEL: define {{.*}}transpose_struct_member
+void transpose_struct_member(struct Foo F) {
+  // CHECK: [[IN_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr %F, i32 0, i32 0
+  // CHECK-NEXT: [[M:%.*]] = load <4 x i32>, ptr [[IN_PTR]], align 4
+  // CHECK-NEXT: [[M_T:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M]], i32 1, i32 4)
+  // CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr %F, i32 0, i32 1
+  // CHECK-NEXT: store <4 x i32> [[M_T]], ptr [[OUT_PTR]], align 4
+
+  F.Out = __builtin_matrix_transpose(F.In);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_transpose_struct_member
+void transpose_transpose_struct_member(struct Foo F) {
+  // CHECK: [[IN_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr %F, i32 0, i32 0
+  // CHECK-NEXT: [[M:%.*]] = load <4 x i32>, ptr [[IN_PTR]], align 4
+  // CHECK-NEXT: [[M_T:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M]], i32 1, i32 4)
+  // CHECK-NEXT: [[M_T2:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M_T]], i32 4, i32 1)
+  // CHECK: [[OUT_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr %F, i32 0, i32 0
+  // CHECK-NEXT: store <4 x i32> [[M_T2]], ptr [[OUT_PTR]], align 4
+
+  F.In = __builtin_matrix_transpose(__builtin_matrix_transpose(F.In));
+}
+
+double4x4 get_matrix(void);
+
+// CHECK-LABEL: define {{.*}}transpose_rvalue
+void transpose_rvalue(void) {
+  // CHECK: [[M_T_ADDR:%.*]] = alloca [16 x double], align 8
+  // CHECK-NEXT: [[CALL:%.*]] = call{{.*}} <16 x double> @_Z10get_matrixv()
+  // CHECK-NEXT: [[M_T:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[CALL]], i32 4, i32 4)
+  // CHECK-NEXT: store <16 x double> [[M_T]], ptr [[M_T_ADDR]], align 8
+
+  double4x4 m_t = __builtin_matrix_transpose(get_matrix());
+}
+
+double4x4 global_matrix;
+
+// CHECK-LABEL: define {{.*}}transpose_global
+void transpose_global(void) {
+  // CHECK: [[M_T_ADDR:%.*]] = alloca [16 x double], align 8
+ // CHECK-NEXT: [[GLOBAL_MATRIX:%.*]] = load <16 x double>, ptr @global_matrix, align 8 + // CHECK-NEXT: [[M_T:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[GLOBAL_MATRIX]], i32 4, i32 4) + // CHECK-NEXT: store <16 x double> [[M_T]], ptr [[M_T_ADDR]], align 8 + + double4x4 m_t = __builtin_matrix_transpose(global_matrix); +} diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl new file mode 100644 index 0000000000000..7d45039709074 --- /dev/null +++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl @@ -0,0 +1,449 @@ +// RUN: %clang_cc1 -O0 -triple spirv-unknown-vulkan-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,SPIRV,NOOPT -DIPTR_T=i64 -DALIGN=8 +// RUN: %clang_cc1 -O1 -triple spirv-unknown-vulkan-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,SPIRV,OPT -DIPTR_T=i64 -DALIGN=8 +// RUN: %clang_cc1 -O0 -triple dxil-pc-shadermodel6.3-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,NOOPT -DIPTR_T=i32 -DALIGN=4 +// RUN: %clang_cc1 -O1 -triple dxil-pc-shadermodel6.3-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,OPT -DIPTR_T=i32 -DALIGN=4 + +template +struct MyMatrix { + using matrix_t = matrix; + + matrix_t value; +}; + +template +typename MyMatrix::matrix_t add(inout MyMatrix A, inout MyMatrix B) { + return A.value + B.value; +} + +// CHECK-LABEL: define {{.*}}test_add_template +void test_add_template() { + // CHECK: call{{.*}} <8 x float> @_Z3addIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(ptr noalias 
noundef nonnull align 4 dereferenceable(32) %{{.*}}, ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}}) + + // CHECK-LABEL: define{{.*}} <8 x float> @_Z3addIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_( + // NOOPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fadd <8 x float> [[MAT1]], [[MAT2]] + // CHECK-NEXT: ret <8 x float> [[RES]] + + MyMatrix Mat1; + MyMatrix Mat2; + Mat1.value = add(Mat1, Mat2); +} + +template +typename MyMatrix::matrix_t subtract(inout MyMatrix A, inout MyMatrix B) { + return A.value - B.value; +} + +// CHECK-LABEL: define {{.*}}test_subtract_template +void test_subtract_template() { + // CHECK: call{{.*}} <8 x float> @_Z8subtractIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}}, ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}}) + + // CHECK-LABEL: define{{.*}} <8 x float> @_Z8subtractIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_( + // NOOPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fsub <8 x float> [[MAT1]], [[MAT2]] + // CHECK-NEXT: ret <8 x float> [[RES]] + + MyMatrix Mat1; + MyMatrix Mat2; + Mat1.value = subtract(Mat1, Mat2); +} + +struct DoubleWrapper1 { + int x; + operator double() { + return x; + } +}; + +// CHECK-LABEL: define {{.*}}test_DoubleWrapper1_Sub1 +void test_DoubleWrapper1_Sub1(inout MyMatrix m) { + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr 
{{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper1cvdEv(ptr {{[^,]*}} %w1) + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <12 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + + DoubleWrapper1 w1; + w1.x = 10; + m.value = m.value - w1; +} + +// CHECK-LABEL: define {{.*}}test_DoubleWrapper1_Sub2 +void test_DoubleWrapper1_Sub2(inout MyMatrix m) { + // CHECK: [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper1cvdEv(ptr {{[^,]*}} %w1) + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <12 x double> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + + DoubleWrapper1 w1; + w1.x = 10; + m.value = w1 - m.value; +} + +struct DoubleWrapper2 { + int x; + operator double() { + return x; + } +}; + +// CHECK-LABEL: define {{.*}}test_DoubleWrapper2_Add1 +void test_DoubleWrapper2_Add1(inout MyMatrix m) { + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.+}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.+}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK: [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper2cvdEv(ptr {{[^,]*}} %w2) + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], 
i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <12 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + + DoubleWrapper2 w2; + w2.x = 20; + m.value = m.value + w2; +} + +// CHECK-LABEL: define {{.*}}test_DoubleWrapper2_Add2 +void test_DoubleWrapper2_Add2(inout MyMatrix m) { + // CHECK: [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper2cvdEv(ptr {{[^,]*}} %w2) + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <12 x double> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + + DoubleWrapper2 w2; + w2.x = 20; + m.value = w2 + m.value; +} + +struct IntWrapper { + uint16_t x; + operator int() { + return x; + } +}; + +// CHECK-LABEL: define {{.*}}test_IntWrapper_Add +void test_IntWrapper_Add(inout MyMatrix m) { + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3) + // CHECK-NEXT: [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <12 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK: store <12 x 
double> [[RES]], ptr {{.*}}, align 8 + + IntWrapper w3; + w3.x = 13; + m.value = m.value + w3; +} + +// CHECK-LABEL: define {{.*}}test_IntWrapper_Sub +void test_IntWrapper_Sub(inout MyMatrix m) { + // CHECK: [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3) + // CHECK-NEXT: [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <12 x double> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + + IntWrapper w3; + w3.x = 13; + m.value = w3 - m.value; +} + +template +typename MyMatrix::matrix_t multiply(inout MyMatrix A, inout MyMatrix B) { + return A.value * B.value; +} + +// CHECK-LABEL: define {{.*}}test_multiply_template +MyMatrix test_multiply_template(MyMatrix Mat1, + MyMatrix Mat2) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %tmp = alloca %struct.MyMatrix, align 4 + // CHECK-NEXT: %tmp1 = alloca %struct.MyMatrix.2, align 4 + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %tmp, ptr align 4 %Mat1, [[IPTR_T]] 32, i1 false) + // OPT-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %tmp) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %tmp1, ptr align 4 %Mat2, [[IPTR_T]] 32, i1 false) + // OPT-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %tmp1) + // CHECK-NEXT: [[RES:%.*]] = call{{.*}} <4 x float> @_Z8multiplyIfLj2ELj4ELj2EEN8MyMatrixIT_XT0_EXT2_EE8matrix_tES0_IS1_XT0_EXT1_EES0_IS1_XT1_EXT2_EE(ptr noalias noundef nonnull align 4 dereferenceable(32) %tmp, 
ptr noalias noundef nonnull align 4 dereferenceable(32) %tmp1) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %Mat1, ptr align 4 %tmp, [[IPTR_T]] 32, i1 false) + // OPT-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr %tmp) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %Mat2, ptr align 4 %tmp1, [[IPTR_T]] 32, i1 false) + // OPT-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr %tmp1) + // CHECK-NEXT: %value = getelementptr inbounds nuw %struct.MyMatrix.1, ptr %agg.result, i32 0, i32 0 + // CHECK-NEXT: store <4 x float> [[RES]], ptr %value, align 4 + // CHECK-NEXT: ret void + // + // CHECK-LABEL: define{{.*}} <4 x float> @_Z8multiplyIfLj2ELj4ELj2EEN8MyMatrixIT_XT0_EXT2_EE8matrix_tES0_IS1_XT0_EXT1_EES0_IS1_XT1_EXT2_EE( + // NOOPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT: [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.matrix.multiply.v4f32.v8f32.v8f32(<8 x float> [[MAT1]], <8 x float> [[MAT2]], i32 2, i32 4, i32 2) + // CHECK-NEXT: ret <4 x float> [[RES]] + + MyMatrix Res; + Res.value = multiply(Mat1, Mat2); + return Res; +} + +// CHECK-LABEL: define {{.*}}test_IntWrapper_Multiply +void test_IntWrapper_Multiply(inout MyMatrix m, inout IntWrapper w3) { + // CHECK: [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr noundef {{.*}}) + // CHECK-NEXT: [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double + // NOOPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x 
double> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <12 x double> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK: store <12 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + m.value = w3 * m.value; +} + +template +void insert(inout MyMatrix Mat, EltTy e, unsigned i, unsigned j) { + Mat.value[i][j] = e; +} + +// CHECK-LABEL: define {{.*}}test_insert_template1 +void test_insert_template1(inout MyMatrix Mat, unsigned e, unsigned i, unsigned j) { + // NOOPT: [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align [[ALIGN]]{{$}} + // NOOPT: [[E:%.*]] = load i32, ptr %e.addr, align 4{{$}} + // NOOPT-NEXT: [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // NOOPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT: [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // OPT: [[E:%.*]] = load i32, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: call{{.*}} void @_Z6insertIjLj2ELj2EEv8MyMatrixIT_XT0_EXT1_EES1_jj(ptr noalias noundef nonnull align 4 dereferenceable(16) %{{.*}}, i32 noundef [[E]], i32 noundef [[I]], i32 noundef [[J]]) + // CHECK: ret void + // + // CHECK-LABEL: define{{.*}} void @_Z6insertIjLj2ELj2EEv8MyMatrixIT_XT0_EXT1_EES1_jj( + // NOOPT: [[E:%.*]] = load i32, ptr %e.addr, align 4{{$}} + // NOOPT: [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[E:%.*]] = load i32, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT: [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I:%.*]] = zext i32 {{.*}} to i64 + // NOOPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J:%.*]] = zext i32 {{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[J]], 2 + // 
CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 4 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <4 x i32> [[MATINS]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + insert(Mat, e, i, j); +} + +// CHECK-LABEL: define {{.*}}test_insert_template2 +void test_insert_template2(inout MyMatrix Mat, float e) { + // NOOPT: [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align [[ALIGN]]{{$}} + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // OPT: [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: call{{.*}} void @_Z6insertIfLj3ELj4EEv8MyMatrixIT_XT0_EXT1_EES1_jj(ptr noalias noundef nonnull align 4 dereferenceable(48) %{{.*}}, float noundef [[E]], i32 noundef 2, i32 noundef 3) + // CHECK: ret void + // + // CHECK-LABEL: define{{.*}} void @_Z6insertIfLj3ELj4EEv8MyMatrixIT_XT0_EXT1_EES1_jj( + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // NOOPT: [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT: [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I:%.*]] = zext i32 {{.*}} to i64 + // NOOPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J:%.*]] = zext i32 {{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[J]], 3 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 12 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <12 x 
float>, ptr {{.*}}, align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <12 x float> [[MAT]], float [[E]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <12 x float> [[MATINS]], ptr {{.*}}, align 4 + // CHECK-NEXT: ret void + + insert(Mat, e, 2, 3); +} + +template +EltTy extract(inout MyMatrix Mat) { + return Mat.value[1u][0u]; +} + +// CHECK-LABEL: define {{.*}}test_extract_template +int test_extract_template(MyMatrix Mat1) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %tmp = alloca %struct.MyMatrix.5, align 4 + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %tmp, ptr align 4 %Mat1, [[IPTR_T]] 16, i1 false) + // OPT-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %tmp) + // CHECK-NEXT: [[CALL:%.*]] = call{{.*}} i32 @_Z7extractIiLj2ELj2EET_8MyMatrixIS0_XT0_EXT1_EE(ptr noalias noundef nonnull align 4 dereferenceable(16) %tmp) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.[[IPTR_T]](ptr align 4 %Mat1, ptr align 4 %tmp, [[IPTR_T]] 16, i1 false) + // OPT-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %tmp) + // CHECK-NEXT: ret i32 [[CALL]] + // + // CHECK-LABEL: define{{.*}} i32 @_Z7extractIiLj2ELj2EET_8MyMatrixIS0_XT0_EXT1_EE( + // NOOPT: [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <4 x i32> [[MAT]], [[IPTR_T]] 1 + // CHECK-NEXT: ret i32 [[MATEXT]] + + return extract(Mat1); +} + +template +auto matrix_subscript(double4x4 m, R r, C c) -> decltype(m[r][c]) {} + +// CHECK-LABEL: define {{.*}}test_matrix_subscript +double test_matrix_subscript(double4x4 m) { + // NOOPT: [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[CALL:%.*]] = call{{.*}} nonnull align 8 dereferenceable(8) ptr 
@_Z16matrix_subscriptIiiEDTixixfp_fp0_fp1_Eu11matrix_typeIL{{[mj]}}4EL{{[mj]}}4EdET_T0_(<16 x double> noundef [[MAT]], i32 noundef 1, i32 noundef 2) + // NOOPT-NEXT: [[RES:%.*]] = load double, ptr [[CALL]], align 8{{$}} + // OPT-NEXT: [[RES:%.*]] = load double, ptr [[CALL]], align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: ret double [[RES]] + + return matrix_subscript(m, 1, 2); +} + +// CHECK-LABEL: define {{.*}}test_matrix_subscript_const +const double test_matrix_subscript_const(const double4x4 m) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x double], align 8 + // CHECK-NEXT: store <16 x double> [[M:%.*]], ptr [[M_ADDR]], align 8 + // NOOPT: [[NAMELESS1:%.*]] = load <16 x double>, ptr [[M_ADDR]], align 8{{$}} + // OPT: [[NAMELESS1:%.*]] = load <16 x double>, ptr [[M_ADDR]], align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], [[IPTR_T]] 4 + // CHECK-NEXT: ret double [[MATEXT]] + + return m[0][1]; +} + +struct UnsignedWrapper { + char x; + operator unsigned() { + return x; + } +}; + +// CHECK-LABEL: define {{.*}}extract_IntWrapper_idx +double extract_IntWrapper_idx(inout double4x4 m, IntWrapper i, UnsignedWrapper j) { + // CHECK: [[I:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %i) + // CHECK-NEXT: [[I_ADD:%.*]] = add nsw i32 [[I]], 1 + // SPIRV-NEXT: [[I_ADD:%.*]] = sext i32 {{.*}} to i64 + // CHECK-NEXT: [[J:%.*]] = call{{.*}} i32 @_ZN15UnsignedWrappercvjEv(ptr {{[^,]*}} %j) + // CHECK-NEXT: [[J_SUB:%.*]] = sub i32 [[J]], 1 + // SPIRV-NEXT: [[J_SUB:%.*]] = zext i32 {{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[J_SUB]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I_ADD]] + // NOOPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]]{{$}} + // NOOPT-NEXT: [[MAT:%.*]] = load <16 x double>, ptr [[MAT_ADDR]], align 8{{$}} + // OPT-NEXT: [[CMP:%.*]] = 
icmp ult [[IPTR_T]] [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // OPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <16 x double>, ptr [[MAT_ADDR]], align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: ret double [[MATEXT]] + return m[i + 1][j - 1]; +} + +template +using matrix_type = matrix; +struct identmatrix_t { + template + operator matrix_type() const { + matrix_type result; + for (unsigned i = 0; i != N; ++i) + result[i][i] = 1; + return result; + } +}; + +constexpr identmatrix_t identmatrix; + +// CHECK-LABEL: define {{.*}}test_constexpr1 +void test_constexpr1(inout matrix_type m) { + // NOOPT: [[MAT:%.*]] = load <16 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT:%.*]] = load <16 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[IM:%.*]] = call{{.*}} <16 x float> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIfLj4EEEv(ptr {{[^,]*}} @_ZL11identmatrix) + // CHECK-NEXT: [[ADD:%.*]] = fadd <16 x float> [[MAT]], [[IM]] + // NOOPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]]{{$}} + // OPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: store <16 x float> [[ADD]], ptr [[MAT_ADDR]], align 4 + // CHECK-NEXT: ret void + + // CHECK-LABEL: define{{.*}} <16 x float> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIfLj4EEEv( + // CHECK-LABEL: for.body: ; preds = %for.cond + // NOOPT-NEXT: [[I:%.*]] = load i32, ptr %i, align 4{{$}} + // OPT-NEXT: [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I:%.*]] = zext i32 {{.*}} to i64 + // NOOPT-NEXT: [[I2:%.*]] = load i32, ptr %i, align 4{{$}} + // OPT-NEXT: [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I2:%.*]] = zext i32 {{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul
[[IPTR_T]] [[I2]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <16 x float>, ptr %result, align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <16 x float> [[MATINS]], ptr %result, align 4 + // CHECK-NEXT: br label %for.inc + m = m + identmatrix; +} + +// CHECK-LABEL: define {{.*}}test_constexpr2 +void test_constexpr2(inout matrix_type m) { + // CHECK: [[IM:%.*]] = call{{.*}} <16 x i32> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIiLj4EEEv(ptr {{[^,]*}} @_ZL11identmatrix) + // NOOPT: [[MAT:%.*]] = load <16 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT:%.*]] = load <16 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SUB:%.*]] = sub <16 x i32> [[IM]], [[MAT]] + // CHECK-NEXT: [[SUB2:%.*]] = add <16 x i32> [[SUB]], splat (i32 1) + // NOOPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]]{{$}} + // OPT-NEXT: [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: store <16 x i32> [[SUB2]], ptr [[MAT_ADDR]], align 4 + // CHECK-NEXT: ret void + // + + // CHECK-LABEL: define{{.*}} <16 x i32> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIiLj4EEEv( + // CHECK-LABEL: for.body: ; preds = %for.cond + // NOOPT-NEXT: [[I:%.*]] = load i32, ptr %i, align 4{{$}} + // OPT-NEXT: [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I:%.*]] = zext i32 {{.*}} to i64 + // NOOPT-NEXT: [[I2:%.*]] = load i32, ptr %i, align 4{{$}} + // OPT-NEXT: [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I2:%.*]] = zext i32 {{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[I2]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult 
[[IPTR_T]] [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <16 x i32>, ptr %result, align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x i32> [[MAT]], i32 1, [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <16 x i32> [[MATINS]], ptr %result, align 4 + // CHECK-NEXT: br label %for.inc + + m = identmatrix - m + 1; +} diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl new file mode 100644 index 0000000000000..be412d23a676f --- /dev/null +++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl @@ -0,0 +1,1255 @@ +// RUN: %clang_cc1 -O0 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,SPIRV,NOOPT -DIPTR_T=i64 -DALIGN=8 +// RUN: %clang_cc1 -O1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,SPIRV,OPT -DIPTR_T=i64 -DALIGN=8 +// RUN: %clang_cc1 -O0 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,DXIL,NOOPT -DIPTR_T=i32 -DALIGN=4 +// RUN: %clang_cc1 -O1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,DXIL,OPT -DIPTR_T=i32 -DALIGN=4 + +// Test arithmetic operations on matrix types. +// This is adapted to HLSL from CodeGen/matrix-type-operators.c. + +// Floating point matrix/scalar additions. 
+ +// CHECK-LABEL: define {{.*}}add_matrix_matrix_double +void add_matrix_matrix_double(double4x4 a, double4x4 b, double4x4 c) { + // NOOPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fadd <16 x double> [[B]], [[C]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a = b + c; +} + +// CHECK-LABEL: define {{.*}}add_compound_assign_matrix_double +void add_compound_assign_matrix_double(double4x4 a, double4x4 b) { + // NOOPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fadd <16 x double> [[A]], [[B]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a += b; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_assign_matrix_double +void subtract_compound_assign_matrix_double(double4x4 a, double4x4 b) { + // NOOPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fsub <16 x double> [[A]], [[B]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a -= b; +} + +// CHECK-LABEL: define {{.*}}add_matrix_matrix_float +void add_matrix_matrix_float(float2x3 a, float2x3 b, float2x3 c) { + // NOOPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // 
NOOPT-NEXT: [[C:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[B]], [[C]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + + a = b + c; +} + +// CHECK-LABEL: define {{.*}}add_compound_assign_matrix_float +void add_compound_assign_matrix_float(float2x3 a, float2x3 b) { + // NOOPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[A]], [[B]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + + a += b; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_assign_matrix_float +void subtract_compound_assign_matrix_float(float2x3 a, float2x3 b) { + // NOOPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fsub <6 x float> [[A]], [[B]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + + a -= b; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_double_float +void add_matrix_scalar_double_float(double4x4 a, float vf) { + // NOOPT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4{{$}} + // OPT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4, !tbaa 
!{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a = a + vf; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_double_float +void add_compound_matrix_scalar_double_float(double4x4 a, float vf) { + // NOOPT: [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4{{$}} + // OPT: [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double + // NOOPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a += vf; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_double_float +void subtract_compound_matrix_scalar_double_float(double4x4 a, float vf) { + // NOOPT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4{{$}} + // OPT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double + // NOOPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // 
CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a -= vf; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_double_double +void add_matrix_scalar_double_double(double4x4 a, double vd) { + // NOOPT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + + a = a + vd; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_double_double +void add_compound_matrix_scalar_double_double(double4x4 a, double vd) { + // NOOPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] 
= fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + a += vd; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_double_double +void subtract_compound_matrix_scalar_double_double(double4x4 a, double vd) { + // NOOPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <16 x double> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + a -= vd; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_float_float +void add_matrix_scalar_float_float(float2x3 b, float vf) { + // NOOPT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4{{$}} + // OPT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + + b = b + vf; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_float_float +void add_compound_matrix_scalar_float_float(float2x3 b, float vf) { + // NOOPT: [[SCALAR:%.*]] = load
float, ptr %vf.addr, align 4{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr %b.addr, align 4{{$}} + // OPT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr %b.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + b += vf; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_float_float +void subtract_compound_matrix_scalar_float_float(float2x3 b, float vf) { + // NOOPT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr %b.addr, align 4{{$}} + // OPT: [[SCALAR:%.*]] = load float, ptr %vf.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr %b.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + b -= vf; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_float_double +void add_matrix_scalar_float_double(float2x3 b, double vd) { + // NOOPT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // 
CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + + b = b + vd; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_float_double +void add_compound_matrix_scalar_float_double(float2x3 b, double vd) { + // NOOPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float + // NOOPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + b += vd; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_float_double +void subtract_compound_matrix_scalar_float_double(float2x3 b, double vd) { + // NOOPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load double, ptr %vd.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float + // NOOPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = 
insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fsub <6 x float> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr {{.*}}, align 4 + b -= vd; +} + +// Integer matrix/scalar additions + +// CHECK-LABEL: define {{.*}}add_matrix_matrix_int +void add_matrix_matrix_int(int4x3 a, int4x3 b, int4x3 c) { + // NOOPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[B]], [[C]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr {{.*}}, align 4 + a = b + c; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_matrix_int +void add_compound_matrix_matrix_int(int4x3 a, int4x3 b) { + // NOOPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[A]], [[B]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr {{.*}}, align 4 + a += b; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_matrix_int +void subtract_compound_matrix_matrix_int(int4x3 a, int4x3 b) { + // NOOPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = sub <12 x i32> 
[[A]], [[B]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr {{.*}}, align 4 + a -= b; +} + +// CHECK-LABEL: define {{.*}}add_matrix_matrix_uint64 +void add_matrix_matrix_uint64(uint64_t4x2 a, uint64_t4x2 b, uint64_t4x2 c) { + // NOOPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[C:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[B]], [[C]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + a = b + c; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_matrix_uint64 +void add_compound_matrix_matrix_uint64(uint64_t4x2 a, uint64_t4x2 b) { + // NOOPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[A]], [[B]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + a += b; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_matrix_uint64 +void subtract_compound_matrix_matrix_uint64(uint64_t4x2 a, uint64_t4x2 b) { + // NOOPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT: [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = sub <8 x i64> [[A]], [[B]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + a -= b; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_int16 +void add_matrix_scalar_int_int16(int4x3 a, int16_t vs) { + // NOOPT: [[MATRIX:%.*]] = load <12 x i32>, ptr 
[[MAT_ADDR:%.*]], align 4{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32 + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a = a + vs; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_int16 +void add_compound_matrix_scalar_int_int16(int4x3 a, int16_t vs) { + // NOOPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT:%.*]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a += vs; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_int16 +void subtract_compound_matrix_scalar_int_int16(int4x3 a, int16_t vs) { + // NOOPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext 
i16 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT:%.*]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a -= vs; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_int64 +void add_matrix_scalar_int_int64(int4x3 a, int64_t vli) { + // NOOPT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a = a + vli; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_int64 +void add_compound_matrix_scalar_int_int64(int4x3 a, int64_t vli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr 
%a.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a += vli; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_int64 +void subtract_compound_matrix_scalar_int_int64(int4x3 a, int64_t vli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr %a.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a -= vli; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_uint64 +void add_matrix_scalar_int_uint64(int4x3 a, uint64_t vulli) { + // NOOPT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}} + // NOOPT-NEXT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // OPT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 
0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a = a + vulli; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_uint64 +void add_compound_matrix_scalar_int_uint64(int4x3 a, uint64_t vulli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4 + + a += vulli; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_uint64 +void subtract_compound_matrix_scalar_int_uint64(int4x3 a, uint64_t vulli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> 
[[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[MATRIX_ADDR]], align 4 + + a -= vulli; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_short +void add_matrix_scalar_uint64_short(uint64_t4x2 b, int16_t vs) { + // NOOPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b = vs + b; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_short +void add_compound_matrix_scalar_uint64_short(uint64_t4x2 b, int16_t vs) { + // NOOPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr 
{{.*}}, align 8 + + b += vs; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_short +void subtract_compound_matrix_scalar_uint64_short(uint64_t4x2 b, int16_t vs) { + // NOOPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2{{$}} + // OPT: [[SCALAR:%.*]] = load i16, ptr %vs.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64 + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b -= vs; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_int +void add_matrix_scalar_uint64_int(uint64_t4x2 b, int64_t vli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b = vli + b; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_int +void add_compound_matrix_scalar_uint64_int(uint64_t4x2 b, int64_t vli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // NOOPT-NEXT: 
[[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b += vli; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_int +void subtract_compound_matrix_scalar_uint64_int(uint64_t4x2 b, int64_t vli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b -= vli; +} + +// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_uint64 +void add_matrix_scalar_uint64_uint64(uint64_t4x2 b, uint64_t vulli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 
[[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + b = vulli + b; +} + +// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_uint64 +void add_compound_matrix_scalar_uint64_uint64(uint64_t4x2 b, uint64_t vulli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 8 + + b += vulli; +} + +// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_uint64 +void subtract_compound_matrix_scalar_uint64_uint64(uint64_t4x2 b, uint64_t vulli) { + // NOOPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8{{$}} + // NOOPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8{{$}} + // OPT: [[SCALAR:%.*]] = load i64, ptr %vulli.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[MATRIX:%.*]] = load <8 x i64>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0 + // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr {{.*}}, align 
8 + + b -= vulli; +} + +// Tests for matrix multiplication. + +// CHECK-LABEL: define {{.*}}multiply_matrix_matrix_double +void multiply_matrix_matrix_double(double4x4 b, double4x4 c) { + // NOOPT: [[B:%.*]] = load <16 x double>, ptr %b.addr, align 8{{$}} + // NOOPT-NEXT: [[C:%.*]] = load <16 x double>, ptr %c.addr, align 8{{$}} + // OPT: [[B:%.*]] = load <16 x double>, ptr %b.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <16 x double>, ptr %c.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> [[B]], <16 x double> [[C]], i32 4, i32 4, i32 4) + // CHECK-NEXT: store <16 x double> [[RES]], ptr %a, align 8 + // OPT-NEXT: call void @llvm.lifetime.end.p0(i64 128, ptr %a) + // CHECK-NEXT: ret void + + double4x4 a; + a = b * c; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_matrix_matrix_double +void multiply_compound_matrix_matrix_double(double4x4 b, double4x4 c) { + // NOOPT: [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> [[B]], <16 x double> [[C]], i32 4, i32 4, i32 4) + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + b *= c; +} + +// CHECK-LABEL: define {{.*}}multiply_matrix_matrix_int +void multiply_matrix_matrix_int(int4x4 b, int4x4 c) { + // NOOPT: [[B:%.*]] = load <16 x i32>, ptr {{.*}}, align 4{{$}} + // NOOPT-NEXT: [[C:%.*]] = load <16 x i32>, ptr {{.*}}, align 4{{$}} + // OPT: [[B:%.*]] = load <16 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[C:%.*]] = load <16 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + 
// CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.matrix.multiply.v16i32.v16i32.v16i32(<16 x i32> [[B]], <16 x i32> [[C]], i32 4, i32 4, i32 4) + // CHECK-NEXT: store <16 x i32> [[RES]], ptr %a, align 4 + // OPT-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %a) + // CHECK: ret void + int4x4 a; + a = b * c; +} + +// CHECK-LABEL: define {{.*}}multiply_double_matrix_scalar_float +void multiply_double_matrix_scalar_float(double4x4 a, float s) { + // NOOPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[S:%.*]] = load float, ptr %s.addr, align 4{{$}} + // OPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load float, ptr %s.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = fpext float [[S]] to double + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a = a * s; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_double_matrix_scalar_float +void multiply_compound_double_matrix_scalar_float(double4x4 a, float s) { + // NOOPT: [[S:%.*]] = load float, ptr %s.addr, align 4{{$}} + // OPT: [[S:%.*]] = load float, ptr %s.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = fpext float [[S]] to double + // NOOPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul 
<16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a *= s; +} + +// CHECK-LABEL: define {{.*}}multiply_double_matrix_scalar_double +void multiply_double_matrix_scalar_double(double4x4 a, double s) { + // NOOPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8{{$}} + // OPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a = a * s; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_double_matrix_scalar_double +void multiply_compound_double_matrix_scalar_double(double4x4 a, double s) { + // NOOPT: [[S:%.*]] = load double, ptr %s.addr, align 8{{$}} + // NOOPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a *= s; +} + +// CHECK-LABEL: define {{.*}}multiply_float_matrix_scalar_double +void multiply_float_matrix_scalar_double(float2x3 b, double s) { + // NOOPT: [[S:%.*]] = 
load double, ptr %s.addr, align 8{{$}} + // OPT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = fptrunc double [[S]] to float + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <6 x float> [[VECSPLAT]], [[MAT]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = s * b; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_float_matrix_scalar_double +void multiply_compound_float_matrix_scalar_double(float2x3 b, double s) { + // NOOPT: [[S:%.*]] = load double, ptr %s.addr, align 8{{$}} + // OPT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = fptrunc double [[S]] to float + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fmul <6 x float> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b *= s; +} + +// CHECK-LABEL: define {{.*}}multiply_int_matrix_scalar_int16 +void multiply_int_matrix_scalar_int16(int4x3 b, int16_t s) { + // NOOPT: [[S:%.*]] = load i16, ptr %s.addr, align 2{{$}} + // OPT: [[S:%.*]] = load i16, ptr %s.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32 + // NOOPT-NEXT: 
[[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> [[VECSPLAT]], [[MAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = s * b; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_scalar_int16 +void multiply_compound_int_matrix_scalar_int16(int4x3 b, int16_t s) { + // NOOPT: [[S:%.*]] = load i16, ptr %s.addr, align 2{{$}} + // OPT: [[S:%.*]] = load i16, ptr %s.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32 + // NOOPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b *= s; +} + +// CHECK-LABEL: define {{.*}}multiply_int_matrix_scalar_ull +void multiply_int_matrix_scalar_ull(int4x3 b, uint64_t s) { + // NOOPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // OPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8{{$}} + // OPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32 + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], 
i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = b * s; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_scalar_ull +void multiply_compound_int_matrix_scalar_ull(int4x3 b, uint64_t s) { + // NOOPT: [[S:%.*]] = load i64, ptr %s.addr, align 8{{$}} + // OPT: [[S:%.*]] = load i64, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32 + // NOOPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + b *= s; +} + +// CHECK-LABEL: define {{.*}}multiply_float_matrix_constant +void multiply_float_matrix_constant(float2x3 a) { + // CHECK: [[A_ADDR:%.*]] = alloca [6 x float], align 4 + // CHECK-NEXT: store <6 x float> %a, ptr [[A_ADDR]], align 4 + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fmul <6 x float> [[MAT]], splat (float 2.500000e+00) + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[A_ADDR]], align 4 + // CHECK-NEXT: ret void + a = a * 2.5; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_float_matrix_constant +void multiply_compound_float_matrix_constant(float2x3 a) { + // CHECK: [[A_ADDR:%.*]] = alloca [6 x float], align 4 + // 
CHECK-NEXT: store <6 x float> [[A:%.*]], ptr [[A_ADDR]], align 4 + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fmul <6 x float> [[MAT]], splat (float 2.500000e+00) + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[A_ADDR]], align 4 + // CHECK-NEXT: ret void + a *= 2.5; +} + +// CHECK-LABEL: define {{.*}}multiply_int_matrix_constant +void multiply_int_matrix_constant(int4x3 a) { + // CHECK: [[A_ADDR:%.*]] = alloca [12 x i32], align 4 + // CHECK-NEXT: store <12 x i32> [[A:%.*]], ptr [[A_ADDR]], align 4 + // NOOPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> splat (i32 5), [[MAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[A_ADDR]], align 4 + // CHECK-NEXT: ret void + a = 5 * a; +} + +// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_constant +void multiply_compound_int_matrix_constant(int4x3 a) { + // CHECK: [[A_ADDR:%.*]] = alloca [12 x i32], align 4 + // CHECK-NEXT: store <12 x i32> [[A:%.*]], ptr [[A_ADDR]], align 4 + // NOOPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = mul <12 x i32> [[MAT]], splat (i32 5) + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[A_ADDR]], align 4 + // CHECK-NEXT: ret void + a *= 5; +} + +// CHECK-LABEL: define {{.*}}divide_double_matrix_scalar_float +void divide_double_matrix_scalar_float(double4x4 a, float s) { + // NOOPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[S:%.*]] = load float, ptr %s.addr, align 4{{$}} + // OPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load 
float, ptr %s.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = fpext float [[S]] to double + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fdiv <16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a = a / s; +} + +// CHECK-LABEL: define {{.*}}divide_double_matrix_scalar_double +void divide_double_matrix_scalar_double(double4x4 a, double s) { + // NOOPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // NOOPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8{{$}} + // OPT: [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fdiv <16 x double> [[A]], [[VECSPLAT]] + // CHECK-NEXT: store <16 x double> [[RES]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + a = a / s; +} + +// CHECK-LABEL: define {{.*}}divide_float_matrix_scalar_double +void divide_float_matrix_scalar_double(float2x3 b, double s) { + // NOOPT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}} + // NOOPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8{{$}} + // OPT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load double, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = fptrunc double [[S]] to float + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = 
shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = fdiv <6 x float> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = b / s; +} + +// CHECK-LABEL: define {{.*}}divide_int_matrix_scalar_int16 +void divide_int_matrix_scalar_int16(int4x3 b, int16_t s) { + // NOOPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // NOOPT-NEXT: [[S:%.*]] = load i16, ptr %s.addr, align 2{{$}} + // OPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load i16, ptr %s.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32 + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sdiv <12 x i32> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = b / s; +} + +// CHECK-LABEL: define {{.*}}divide_int_matrix_scalar_ull +void divide_int_matrix_scalar_ull(int4x3 b, uint64_t s) { + // NOOPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // NOOPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8{{$}} + // OPT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32 + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = sdiv <12 x i32> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <12 x i32> [[RES]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + b = b / 
s; +} + +// CHECK-LABEL: define {{.*}}divide_ull_matrix_scalar_ull +void divide_ull_matrix_scalar_ull(uint64_t4x2 b, uint64_t s) { + // NOOPT: [[MAT:%.*]] = load <8 x i64>, ptr [[B:%.*]], align 8{{$}} + // NOOPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8{{$}} + // OPT: [[MAT:%.*]] = load <8 x i64>, ptr [[B:%.*]], align 8, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[S:%.*]] = load i64, ptr %s.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[VECINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[S]], i64 0 + // CHECK-NEXT: [[VECSPLAT:%.*]] = shufflevector <8 x i64> [[VECINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer + // CHECK-NEXT: [[RES:%.*]] = udiv <8 x i64> [[MAT]], [[VECSPLAT]] + // CHECK-NEXT: store <8 x i64> [[RES]], ptr [[B]], align 8 + // CHECK-NEXT: ret void + b = b / s; +} + +// CHECK-LABEL: define {{.*}}divide_float_matrix_constant +void divide_float_matrix_constant(float2x3 a) { + // CHECK: [[A_ADDR:%.*]] = alloca [6 x float], align 4 + // CHECK-NEXT: store <6 x float> [[A:%.*]], ptr [[A_ADDR]], align 4 + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[RES:%.*]] = fdiv <6 x float> [[MAT]], splat (float 2.500000e+00) + // CHECK-NEXT: store <6 x float> [[RES]], ptr [[A_ADDR]], align 4 + // CHECK-NEXT: ret void + a = a / 2.5; +} + + // Tests for the matrix type operators. + + // Check that we can use matrix index expression on different floating point + // matrixes and indices. 
+// CHECK-LABEL: define {{.*}}insert_double_matrix_const_idx_ll_u_double +void insert_double_matrix_const_idx_ll_u_double(double4x4 a, double d, float2x3 b, float e, int j, uint k) { + // NOOPT: [[D:%.*]] = load double, ptr %d.addr, align 8{{$}} + // OPT: [[D:%.*]] = load double, ptr %d.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x double> [[MAT]], double [[D]], [[IPTR_T]] 4 + // CHECK-NEXT: store <16 x double> [[MATINS]], ptr {{.*}}, align 8 + // CHECK-NEXT: ret void + + a[0ll][1u] = d; +} + +// CHECK-LABEL: define {{.*}}insert_double_matrix_const_idx_i_u_double +void insert_double_matrix_const_idx_i_u_double(double4x4 a, double d) { + // NOOPT: [[D:%.*]] = load double, ptr %d.addr, align 8{{$}} + // OPT: [[D:%.*]] = load double, ptr %d.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, ptr [[B:%.*]], align 8{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x double> [[MAT]], double [[D]], [[IPTR_T]] 13 + // CHECK-NEXT: store <16 x double> [[MATINS]], ptr [[B]], align 8 + // CHECK-NEXT: ret void + + a[1][3u] = d; +} + +// CHECK-LABEL: define {{.*}}insert_float_matrix_const_idx_ull_i_float +void insert_float_matrix_const_idx_ull_i_float(float2x3 b, float e) { + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], [[IPTR_T]] 3 + // CHECK-NEXT: store <6 x float> [[MATINS]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + b[1ull][1] = e; +} + +// CHECK-LABEL: define {{.*}}insert_float_matrix_idx_i_u_float +void insert_float_matrix_idx_i_u_float(float2x3 b, float e, int j, uint k) { + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // NOOPT-NEXT: 
[[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J:%.*]] = sext i32 %{{.*}} to i64 + // NOOPT-NEXT: [[K:%.*]] = load i32, ptr %k.addr, align 4{{$}} + // OPT-NEXT: [[K:%.*]] = load i32, ptr %k.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[K:%.*]] = zext i32 %{{.*}} to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[K]], 2 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[J]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <6 x float> [[MATINS]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + b[j][k] = e; +} + +// CHECK-LABEL: define {{.*}}insert_float_matrix_idx_s_ull_float +void insert_float_matrix_idx_s_ull_float(float2x3 b, float e, int16_t j, uint64_t k) { + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // NOOPT-NEXT: [[J:%.*]] = load i16, ptr %j.addr, align 2{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[J:%.*]] = load i16, ptr %j.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[J:%.*]] = sext i16 %{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[K:%.*]] = load i64, ptr %k.addr, align 8{{$}} + // OPT-NEXT: [[K:%.*]] = load i64, ptr %k.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // DXIL-NEXT: [[K:%.*]] = trunc i64 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[K]], 2 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[J]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 
4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <6 x float> [[MATINS]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + (b)[j][k] = e; +} + + // Check that we can use matrix index expressions on integer matrixes. +// CHECK-LABEL: define {{.*}}insert_int_idx_expr +void insert_int_idx_expr(int4x3 a, int i) { + // NOOPT: [[I1:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // NOOPT-NEXT: [[I2:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[I1:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // OPT-NEXT: [[I2:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 4, [[I2]] + // SPIRV-NEXT: [[ADD:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] 8, [[ADD]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 12 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <12 x i32> [[MAT]], i32 [[I1]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <12 x i32> [[MATINS]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + a[4 + i][1 + 1u] = i; +} + +// Check that we can use matrix index expressions on FP and integer +// matrixes. 
+// CHECK-LABEL: define {{.*}}insert_float_into_int_matrix +void insert_float_into_int_matrix(inout int4x3 a, int i) { + // NOOPT: [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[MAT_ADDR1:%.*]] = load ptr, ptr %a.addr, align [[ALIGN]]{{$}} + // OPT-NEXT: [[MAT_ADDR1:%.*]] = load ptr, ptr %a.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <12 x i32> [[MAT]], i32 [[I]], [[IPTR_T]] 7 + // CHECK-NEXT: store <12 x i32> [[MATINS]], ptr [[MAT_ADDR]], align 4 + // CHECK-NEXT: ret void + + a[3][1] = i; +} + +// Check that we can use overloaded matrix index expressions on matrixes with +// matching dimensions, but different element types. +// CHECK-LABEL: define {{.*}}insert_matching_dimensions1 +void insert_matching_dimensions1(double3x3 a, double i) { + // NOOPT: [[I:%.*]] = load double, ptr %i.addr, align 8{{$}} + // OPT: [[I:%.*]] = load double, ptr %i.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <9 x double>, ptr [[B:%.*]], align 8{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x double> [[MAT]], double [[I]], [[IPTR_T]] 5 + // CHECK-NEXT: store <9 x double> [[MATINS]], ptr [[B]], align 8 + // CHECK-NEXT: ret void + + a[2u][1u] = i; +} + +// CHECK-LABEL: define {{.*}}insert_matching_dimensions +void insert_matching_dimensions(float3x3 b, float e) { + // NOOPT: [[E:%.*]] = load float, ptr %e.addr, align 4{{$}} + // OPT: [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, ptr [[B:%.*]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT]], float [[E]], [[IPTR_T]] 7 + // CHECK-NEXT: store <9 x float> [[MATINS]], ptr [[B]], align 4 + // CHECK-NEXT: ret void + + b[1u][2u] = e; +} + +// CHECK-LABEL: define 
{{.*}}extract_double +double extract_double(double4x4 a) { + // NOOPT: [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}} + // OPT: [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], [[IPTR_T]] 10 + // CHECK-NEXT: ret double [[MATEXT]] + + return a[2][3 - 1u]; +} + +// CHECK-LABEL: define {{.*}}extract_float +double extract_float(float3x3 b) { + // NOOPT: [[MAT:%.*]] = load <9 x float>, ptr {{.*}}, align 4{{$}} + // OPT: [[MAT:%.*]] = load <9 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], [[IPTR_T]] 5 + // CHECK-NEXT: [[TO_DOUBLE:%.*]] = fpext float [[MATEXT]] to double + // CHECK-NEXT: ret double [[TO_DOUBLE]] + + return b[2][1]; +} + +// CHECK-LABEL: define {{.*}}extract_int +int extract_int(int4x3 c, uint64_t j) { + // NOOPT: [[J1:%.*]] = load i64, ptr %j.addr, align 8{{$}} + // OPT: [[J1:%.*]] = load i64, ptr %j.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // DXIL-NEXT: [[J1:%.*]] = trunc i64 %{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[J2:%.*]] = load i64, ptr %j.addr, align 8{{$}} + // OPT-NEXT: [[J2:%.*]] = load i64, ptr %j.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // DXIL-NEXT: [[J2:%.*]] = trunc i64 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[J2]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[J1]] + // NOOPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 12 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // OPT-NEXT: [[MAT:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <12 x i32> [[MAT]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: ret i32 [[MATEXT]] + + return c[j][j]; +} + +// CHECK-LABEL: define {{.*}}test_extract_matrix_pointer1 +double test_extract_matrix_pointer1(inout double3x2 ptr, uint j) { + // 
NOOPT: [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J:%.*]] = zext i32 {{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX:%.*]] = add [[IPTR_T]] 3, [[J]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // NOOPT-NEXT: [[PTR:%.*]] = load ptr, ptr %ptr.addr, align [[ALIGN]]{{$}} + // OPT-NEXT: [[PTR:%.*]] = load ptr, ptr %ptr.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x double>, ptr [[PTR]], align 8{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x double>, ptr [[PTR]], align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], [[IPTR_T]] [[IDX]] + // CHECK-NEXT: ret double [[MATEXT]] + + return ptr[j][1]; +} + +// CHECK-LABEL: define {{.*}}test_extract_matrix_pointer2 +double test_extract_matrix_pointer2(inout double3x2 ptr) { + // NOOPT: [[PTR:%.*]] = load ptr, ptr %ptr.addr, align [[ALIGN]]{{$}} + // OPT: [[PTR:%.*]] = load ptr, ptr %ptr.addr, align [[ALIGN]], !tbaa !{{[0-9]+}}{{$}} + // NOOPT-NEXT: [[MAT:%.*]] = load <6 x double>, ptr [[PTR]], align 8{{$}} + // OPT-NEXT: [[MAT:%.*]] = load <6 x double>, ptr [[PTR]], align 8, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], [[IPTR_T]] 5 + // CHECK-NEXT: ret double [[MATEXT]] + + return ptr[2][1 * 3 - 2]; +} + +// CHECK-LABEL: define {{.*}}insert_extract +void insert_extract(double4x4 a, float3x3 b, uint64_t j, int16_t k) { + // NOOPT: [[K:%.*]] = load i16, ptr %k.addr, align 2{{$}} + // OPT: [[K:%.*]] = load i16, ptr %k.addr, align 2, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[K_EXT:%.*]] = sext i16 [[K]] to [[IPTR_T]] + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[K_EXT]], 3 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], 0 + // NOOPT-NEXT: [[MAT:%.*]] = load <9 x float>, ptr [[MAT_ADDR:%.*]], align 4{{$}} + // OPT-NEXT: 
[[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // OPT-NEXT: [[MAT:%.*]] = load <9 x float>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], [[IPTR_T]] [[IDX2]] + // NOOPT-NEXT: [[J:%.*]] = load i64, ptr %j.addr, align 8{{$}} + // OPT-NEXT: [[J:%.*]] = load i64, ptr %j.addr, align 8, !tbaa !{{[0-9]+}}{{$}} + // DXIL-NEXT: [[J:%.*]] = trunc i64 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX3:%.*]] = mul [[IPTR_T]] [[J]], 3 + // CHECK-NEXT: [[IDX4:%.*]] = add [[IPTR_T]] [[IDX3]], 2 + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX4]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT2:%.*]] = load <9 x float>, ptr [[MAT_ADDR]], align 4{{$}} + // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], [[IPTR_T]] [[IDX4]] + // CHECK-NEXT: store <9 x float> [[MATINS]], ptr [[MAT_ADDR]], align 4 + // CHECK-NEXT: ret void + + b[2][j] = b[0][k]; +} + +// CHECK-LABEL: define {{.*}}insert_compound_stmt +void insert_compound_stmt(double4x4 a) { + // CHECK: [[A:%.*]] = load <16 x double>, ptr [[A_PTR:%.*]], align 8{{$}} + // CHECK-NEXT: [[EXT:%.*]] = extractelement <16 x double> [[A]], [[IPTR_T]] 14 + // CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT]], 1.000000e+00 + // CHECK-NEXT: [[A2:%.*]] = load <16 x double>, ptr [[A_PTR]], align 8{{$}} + // CHECK-NEXT: [[INS:%.*]] = insertelement <16 x double> [[A2]], double [[SUB]], [[IPTR_T]] 14 + // CHECK-NEXT: store <16 x double> [[INS]], ptr [[A_PTR]], align 8 + // CHECK-NEXT: ret void + + a[2][3] -= 1.0; +} + +struct Foo { + float2x3 mat; +}; + +// CHECK-LABEL: define {{.*}}insert_compound_stmt_field +void insert_compound_stmt_field(inout struct Foo a, float f, uint i, uint j) { + // NOOPT: [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I:%.*]] = zext i32 
%{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT-NEXT: [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J:%.*]] = zext i32 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX1:%.*]] = mul [[IPTR_T]] [[J]], 2 + // CHECK-NEXT: [[IDX2:%.*]] = add [[IPTR_T]] [[IDX1]], [[I]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + + // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, ptr %mat, align 4{{$}} + // CHECK-NEXT: [[EXT:%.*]] = extractelement <6 x float> [[MAT]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: [[SUM:%.*]] = fadd float [[EXT]], {{.*}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT2:%.*]] = load <6 x float>, ptr %mat, align 4{{$}} + // CHECK-NEXT: [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], [[IPTR_T]] [[IDX2]] + // CHECK-NEXT: store <6 x float> [[INS]], ptr %mat, align 4 + // CHECK-NEXT: ret void + + a.mat[i][j] += f; +} + +// CHECK-LABEL: define {{.*}}matrix_as_idx +void matrix_as_idx(int4x3 a, int i, int j, double4x4 b) { + // NOOPT: [[I1:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT: [[I1:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I1:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[J1:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT-NEXT: [[J1:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J1:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX1_1:%.*]] = mul [[IPTR_T]] [[J1]], 4 + // CHECK-NEXT: [[IDX1_2:%.*]] = add [[IPTR_T]] [[IDX1_1]], [[I1]] + // NOOPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr %a.addr, align 4{{$}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX1_2]], 12 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // OPT-NEXT: [[A:%.*]] = load <12 x i32>, ptr %a.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // 
CHECK-NEXT: [[MI1:%.*]] = extractelement <12 x i32> [[A]], [[IPTR_T]] [[IDX1_2]] + // SPIRV-NEXT: [[MI1:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[J2:%.*]] = load i32, ptr %j.addr, align 4{{$}} + // OPT-NEXT: [[J2:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[J2:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // NOOPT-NEXT: [[I2:%.*]] = load i32, ptr %i.addr, align 4{{$}} + // OPT-NEXT: [[I2:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}} + // SPIRV-NEXT: [[I2:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX2_1:%.*]] = mul [[IPTR_T]] [[I2]], 4 + // CHECK-NEXT: [[IDX2_2:%.*]] = add [[IPTR_T]] [[IDX2_1]], [[J2]] + // NOOPT-NEXT: [[A2:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX2_2]], 12 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // OPT-NEXT: [[A2:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}} + // CHECK-NEXT: [[MI2:%.*]] = extractelement <12 x i32> [[A2]], [[IPTR_T]] [[IDX2_2]] + // CHECK-NEXT: [[MI3:%.*]] = add nsw i32 [[MI2]], 2 + // SPIRV-NEXT: [[MI3:%.*]] = sext i32 %{{.*}} to [[IPTR_T]] + // CHECK-NEXT: [[IDX3_1:%.*]] = mul [[IPTR_T]] [[MI3]], 4 + // CHECK-NEXT: [[IDX3_2:%.*]] = add [[IPTR_T]] [[IDX3_1]], [[MI1]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult [[IPTR_T]] [[IDX3_2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[B:%.*]] = load <16 x double>, ptr [[B_PTR:%.*]], align 8{{$}} + // CHECK-NEXT: [[INS:%.*]] = insertelement <16 x double> [[B]], double 1.500000e+00, [[IPTR_T]] [[IDX3_2]] + // CHECK-NEXT: store <16 x double> [[INS]], ptr [[B_PTR]], align 8 + // CHECK-NEXT: ret void + + b[a[i][j]][a[j][i] + 2] = 1.5; +} + + diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl new file mode 100644 index 0000000000000..8747d23dcd2f1 --- /dev/null +++ 
b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl @@ -0,0 +1,219 @@ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,SPIRV +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// CHECK: %struct.Matrix = type { i16, [12 x float], float } + +// CHECK-LABEL: define {{.*}}load_store_double +void load_store_double(inout double4x4 a, inout double4x4 b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: [[B:%.*]] = load <16 x double>, ptr [[B_PTR]], align 8 + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: store <16 x double> [[B]], ptr [[A_PTR]], align 8 + // CHECK-NEXT: ret void + + a = b; +} + +// CHECK-LABEL: define {{.*}}load_store_float +void load_store_float(inout float3x4 a, inout float3x4 b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: [[B:%.*]] = load <12 x float>, ptr [[B_PTR]], align 4 + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: store <12 x float> [[B]], ptr [[A_PTR]], align 4 + // CHECK-NEXT: ret void + + a = b; +} + +// CHECK-LABEL: define {{.*}}load_store_int +void 
load_store_int(inout int3x4 a, inout int3x4 b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: [[B:%.*]] = load <12 x i32>, ptr [[B_PTR]], align 4 + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: store <12 x i32> [[B]], ptr [[A_PTR]], align 4 + // CHECK-NEXT: ret void + + a = b; +} + +// CHECK-LABEL: define {{.*}}load_store_ull +void load_store_ull(inout uint64_t3x4 a, inout uint64_t3x4 b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: [[B:%.*]] = load <12 x i64>, ptr [[B_PTR]], align 8 + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: store <12 x i64> [[B]], ptr [[A_PTR]], align 8 + // CHECK-NEXT: ret void + + a = b; +} + +// CHECK-LABEL: define {{.*}}load_store_fp16 +void load_store_fp16(inout float16_t3x4 a, inout float16_t3x4 b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: [[B:%.*]] = load <12 x half>, ptr [[B_PTR]], align 2 + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: store <12 x half> [[B]], ptr [[A_PTR]], align 2 + // 
CHECK-NEXT: ret void + + a = b; +} + + +typedef struct { + uint16_t Tmp1; + float3x4 Data; + float Tmp2; +} Matrix; + +// CHECK-LABEL: define {{.*}}matrix_struct +void matrix_struct(Matrix a, Matrix b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %Data = getelementptr inbounds nuw %struct.Matrix, ptr %a, i32 0, i32 1 + // CHECK-NEXT: [[tmp:%[0-9]*]] = load <12 x float>, ptr %Data, align 4 + // CHECK-NEXT: %Data1 = getelementptr inbounds nuw %struct.Matrix, ptr %b, i32 0, i32 1 + // CHECK-NEXT: store <12 x float> [[tmp]], ptr %Data1, align 4 + // CHECK-NEXT: ret void + b.Data = a.Data; +} + +// CHECK-LABEL: define {{.*}}parameter_passing +void parameter_passing(in float3x3 a, inout float3x3 b, out float3x3 c) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca [9 x float], align 4 + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: %c.addr = alloca ptr, align + // CHECK-NEXT: store <9 x float> %a, ptr %a.addr, align 4 + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: store ptr %c, ptr %c.addr, align + // CHECK-NEXT: [[A:%.*]] = load <9 x float>, ptr %a.addr, align 4 + // CHECK-NEXT: [[B:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: store <9 x float> [[A]], ptr [[B]], align 4 + // CHECK-NEXT: [[C:%.*]] = load ptr, ptr %c.addr, align + // CHECK-NEXT: store <9 x float> [[A]], ptr [[C]], align 4 + // CHECK-NEXT: ret void + c = b = a; +} + +// CHECK-LABEL: define {{.*}}return_matrix +float3x3 return_matrix(inout float3x3 a) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: [[A:%.*]] = load <9 x float>, ptr [[A_PTR]], align 4 + // CHECK-NEXT: ret <9 x float> [[A]] + return 
a; +} + + +class MatrixClass { + int Tmp1; + float3x4 Data; + int64_t Tmp2; +}; + +// CHECK-LABEL: define {{.*}}matrix_class_reference +void matrix_class_reference(inout MatrixClass a, inout MatrixClass b) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: %Data = getelementptr inbounds nuw %class.MatrixClass, ptr [[A_PTR]], i32 0, i32 1 + // CHECK-NEXT: [[DATA:%.*]] = load <12 x float>, ptr %Data, align 4 + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: %Data1 = getelementptr inbounds nuw %class.MatrixClass, ptr [[B_PTR]], i32 0, i32 1 + // CHECK-NEXT: store <12 x float> [[DATA]], ptr %Data1, align 4 + // CHECK-NEXT: ret void + b.Data = a.Data; +} + +template +class MatrixClassTemplate { + using MatrixTy = matrix; + int Tmp1; + MatrixTy Data; + int64_t Tmp2; +}; + +template +void matrix_template_reference(inout MatrixClassTemplate a, inout MatrixClassTemplate b) { + b.Data = a.Data; +} + +// CHECK-LABEL: define {{.*}}matrix_template_reference_caller +MatrixClassTemplate matrix_template_reference_caller(matrix Data) { + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %Data.addr = alloca [12 x float], align 4 + // CHECK-NEXT: %Arg = alloca %class.MatrixClassTemplate, align 8 + // CHECK-NEXT: %tmp = alloca %class.MatrixClassTemplate, align 8 + // CHECK-NEXT: %tmp2 = alloca %class.MatrixClassTemplate, align 8 + // CHECK-NEXT: store <12 x float> %Data, ptr %Data.addr, align 4 + // CHECK-NEXT: [[DATA:%.*]] = load <12 x float>, ptr %Data.addr, align 4 + // CHECK-NEXT: %Data1 = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr %Arg, i32 0, i32 1 + // CHECK-NEXT: 
store <12 x float> [[DATA]], ptr %Data1, align 4 + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i{{[0-9]*}}(ptr align 8 %tmp, ptr align 8 %Arg, i{{[0-9]*}} 64, i1 false) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i{{[0-9]*}}(ptr align 8 %tmp2, ptr align 8 %agg.result, i{{[0-9]*}} 64, i1 false) + // CHECK-NEXT: call{{.*}} void @_Z25matrix_template_referenceIfLj3ELj4EEv19MatrixClassTemplateIT_XT0_EXT1_EES2_(ptr noalias nonnull align 8 dereferenceable(64) %tmp, ptr noalias nonnull align 8 dereferenceable(64) %tmp2) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i{{[0-9]*}}(ptr align 8 %Arg, ptr align 8 %tmp, i{{[0-9]*}} 64, i1 false) + // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i{{[0-9]*}}(ptr align 8 %agg.result, ptr align 8 %tmp2, i{{[0-9]*}} 64, i1 false) + // CHECK-NEXT: ret void + + // CHECK-LABEL: define{{.*}} void @_Z25matrix_template_referenceIfLj3ELj4EEv19MatrixClassTemplateIT_XT0_EXT1_EES2_(ptr noalias nonnull align 8 dereferenceable(64) %a, ptr noalias nonnull align 8 dereferenceable(64) %b) + // CHECK-NEXT: entry: + // SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() + // CHECK-NEXT: %a.addr = alloca ptr, align + // CHECK-NEXT: %b.addr = alloca ptr, align + // CHECK-NEXT: store ptr %a, ptr %a.addr, align + // CHECK-NEXT: store ptr %b, ptr %b.addr, align + // CHECK-NEXT: [[A_PTR:%.*]] = load ptr, ptr %a.addr, align + // CHECK-NEXT: %Data = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr [[A_PTR]], i32 0, i32 1 + // CHECK-NEXT: [[DATA:%.*]] = load <12 x float>, ptr %Data, align 4 + // CHECK-NEXT: [[B_PTR:%.*]] = load ptr, ptr %b.addr, align + // CHECK-NEXT: %Data1 = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr [[B_PTR]], i32 0, i32 1 + // CHECK-NEXT: store <12 x float> [[DATA]], ptr %Data1, align 4 + // CHECK-NEXT: ret void + + MatrixClassTemplate Result, Arg; + Arg.Data = Data; + matrix_template_reference(Arg, Result); + return Result; +} + + diff --git a/clang/test/CodeGenHLSL/matrix-types.hlsl 
b/clang/test/CodeGenHLSL/matrix-types.hlsl new file mode 100644 index 0000000000000..721d383cd04f1 --- /dev/null +++ b/clang/test/CodeGenHLSL/matrix-types.hlsl @@ -0,0 +1,348 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DSPIRV| FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DSPIRV -DNAMESPACED| FileCheck %s + +#ifdef NAMESPACED +#define TYPE_DECL(T) hlsl::T T##_Val +#else +#define TYPE_DECL(T) T T##_Val +#endif + +// Until MicrosoftCXXABI supports mangling matrices, +// these have to be local variables for DXIL. +#ifndef SPIRV +void f() { +#endif + +// built-in matrix types: + +// Capture target-specific details. 
+//CHECK: [[PFX:[%@]]]int16_t1x1_Val = [[STR:(alloca|global)]] [1 x i16][[ZI:( zeroinitializer)?]], align 2 +//CHECK: [[PFX]]int16_t1x2_Val = [[STR]] [2 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t1x3_Val = [[STR]] [3 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t1x4_Val = [[STR]] [4 x i16][[ZI]], align 2 +TYPE_DECL( int16_t1x1 ); +TYPE_DECL( int16_t1x2 ); +TYPE_DECL( int16_t1x3 ); +TYPE_DECL( int16_t1x4 ); + +//CHECK: [[PFX]]int16_t2x1_Val = [[STR]] [2 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t2x2_Val = [[STR]] [4 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t2x3_Val = [[STR]] [6 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t2x4_Val = [[STR]] [8 x i16][[ZI]], align 2 +TYPE_DECL( int16_t2x1 ); +TYPE_DECL( int16_t2x2 ); +TYPE_DECL( int16_t2x3 ); +TYPE_DECL( int16_t2x4 ); + +//CHECK: [[PFX]]int16_t3x1_Val = [[STR]] [3 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t3x2_Val = [[STR]] [6 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t3x3_Val = [[STR]] [9 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t3x4_Val = [[STR]] [12 x i16][[ZI]], align 2 +TYPE_DECL( int16_t3x1 ); +TYPE_DECL( int16_t3x2 ); +TYPE_DECL( int16_t3x3 ); +TYPE_DECL( int16_t3x4 ); + +//CHECK: [[PFX]]int16_t4x1_Val = [[STR]] [4 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t4x2_Val = [[STR]] [8 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t4x3_Val = [[STR]] [12 x i16][[ZI]], align 2 +//CHECK: [[PFX]]int16_t4x4_Val = [[STR]] [16 x i16][[ZI]], align 2 +TYPE_DECL( int16_t4x1 ); +TYPE_DECL( int16_t4x2 ); +TYPE_DECL( int16_t4x3 ); +TYPE_DECL( int16_t4x4 ); + +//CHECK: [[PFX]]uint16_t1x1_Val = [[STR]] [1 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t1x2_Val = [[STR]] [2 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t1x3_Val = [[STR]] [3 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t1x4_Val = [[STR]] [4 x i16][[ZI]], align 2 +TYPE_DECL( uint16_t1x1 ); +TYPE_DECL( uint16_t1x2 ); +TYPE_DECL( uint16_t1x3 ); +TYPE_DECL( uint16_t1x4 ); + +//CHECK: [[PFX]]uint16_t2x1_Val = [[STR]] [2 x i16][[ZI]], align 2 +//CHECK: 
[[PFX]]uint16_t2x2_Val = [[STR]] [4 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t2x3_Val = [[STR]] [6 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t2x4_Val = [[STR]] [8 x i16][[ZI]], align 2 +TYPE_DECL( uint16_t2x1 ); +TYPE_DECL( uint16_t2x2 ); +TYPE_DECL( uint16_t2x3 ); +TYPE_DECL( uint16_t2x4 ); + +//CHECK: [[PFX]]uint16_t3x1_Val = [[STR]] [3 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t3x2_Val = [[STR]] [6 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t3x3_Val = [[STR]] [9 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t3x4_Val = [[STR]] [12 x i16][[ZI]], align 2 +TYPE_DECL( uint16_t3x1 ); +TYPE_DECL( uint16_t3x2 ); +TYPE_DECL( uint16_t3x3 ); +TYPE_DECL( uint16_t3x4 ); + +//CHECK: [[PFX]]uint16_t4x1_Val = [[STR]] [4 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t4x2_Val = [[STR]] [8 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t4x3_Val = [[STR]] [12 x i16][[ZI]], align 2 +//CHECK: [[PFX]]uint16_t4x4_Val = [[STR]] [16 x i16][[ZI]], align 2 +TYPE_DECL( uint16_t4x1 ); +TYPE_DECL( uint16_t4x2 ); +TYPE_DECL( uint16_t4x3 ); +TYPE_DECL( uint16_t4x4 ); + +//CHECK: [[PFX]]int1x1_Val = [[STR]] [1 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int1x2_Val = [[STR]] [2 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int1x3_Val = [[STR]] [3 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int1x4_Val = [[STR]] [4 x i32][[ZI]], align 4 +TYPE_DECL( int1x1 ); +TYPE_DECL( int1x2 ); +TYPE_DECL( int1x3 ); +TYPE_DECL( int1x4 ); + +//CHECK: [[PFX]]int2x1_Val = [[STR]] [2 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int2x2_Val = [[STR]] [4 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int2x3_Val = [[STR]] [6 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int2x4_Val = [[STR]] [8 x i32][[ZI]], align 4 +TYPE_DECL( int2x1 ); +TYPE_DECL( int2x2 ); +TYPE_DECL( int2x3 ); +TYPE_DECL( int2x4 ); + +//CHECK: [[PFX]]int3x1_Val = [[STR]] [3 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int3x2_Val = [[STR]] [6 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int3x3_Val = [[STR]] [9 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int3x4_Val = [[STR]] [12 x i32][[ZI]], 
align 4 +TYPE_DECL( int3x1 ); +TYPE_DECL( int3x2 ); +TYPE_DECL( int3x3 ); +TYPE_DECL( int3x4 ); + +//CHECK: [[PFX]]int4x1_Val = [[STR]] [4 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int4x2_Val = [[STR]] [8 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int4x3_Val = [[STR]] [12 x i32][[ZI]], align 4 +//CHECK: [[PFX]]int4x4_Val = [[STR]] [16 x i32][[ZI]], align 4 +TYPE_DECL( int4x1 ); +TYPE_DECL( int4x2 ); +TYPE_DECL( int4x3 ); +TYPE_DECL( int4x4 ); + +//CHECK: [[PFX]]uint1x1_Val = [[STR]] [1 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint1x2_Val = [[STR]] [2 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint1x3_Val = [[STR]] [3 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint1x4_Val = [[STR]] [4 x i32][[ZI]], align 4 +TYPE_DECL( uint1x1 ); +TYPE_DECL( uint1x2 ); +TYPE_DECL( uint1x3 ); +TYPE_DECL( uint1x4 ); + +//CHECK: [[PFX]]uint2x1_Val = [[STR]] [2 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint2x2_Val = [[STR]] [4 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint2x3_Val = [[STR]] [6 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint2x4_Val = [[STR]] [8 x i32][[ZI]], align 4 +TYPE_DECL( uint2x1 ); +TYPE_DECL( uint2x2 ); +TYPE_DECL( uint2x3 ); +TYPE_DECL( uint2x4 ); + +//CHECK: [[PFX]]uint3x1_Val = [[STR]] [3 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint3x2_Val = [[STR]] [6 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint3x3_Val = [[STR]] [9 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint3x4_Val = [[STR]] [12 x i32][[ZI]], align 4 +TYPE_DECL( uint3x1 ); +TYPE_DECL( uint3x2 ); +TYPE_DECL( uint3x3 ); +TYPE_DECL( uint3x4 ); + +//CHECK: [[PFX]]uint4x1_Val = [[STR]] [4 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint4x2_Val = [[STR]] [8 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint4x3_Val = [[STR]] [12 x i32][[ZI]], align 4 +//CHECK: [[PFX]]uint4x4_Val = [[STR]] [16 x i32][[ZI]], align 4 +TYPE_DECL( uint4x1 ); +TYPE_DECL( uint4x2 ); +TYPE_DECL( uint4x3 ); +TYPE_DECL( uint4x4 ); + +//CHECK: [[PFX]]int64_t1x1_Val = [[STR]] [1 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t1x2_Val = [[STR]] [2 x i64][[ZI]], align 8 +//CHECK: 
[[PFX]]int64_t1x3_Val = [[STR]] [3 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t1x4_Val = [[STR]] [4 x i64][[ZI]], align 8 +TYPE_DECL( int64_t1x1 ); +TYPE_DECL( int64_t1x2 ); +TYPE_DECL( int64_t1x3 ); +TYPE_DECL( int64_t1x4 ); + +//CHECK: [[PFX]]int64_t2x1_Val = [[STR]] [2 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t2x2_Val = [[STR]] [4 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t2x3_Val = [[STR]] [6 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t2x4_Val = [[STR]] [8 x i64][[ZI]], align 8 +TYPE_DECL( int64_t2x1 ); +TYPE_DECL( int64_t2x2 ); +TYPE_DECL( int64_t2x3 ); +TYPE_DECL( int64_t2x4 ); + +//CHECK: [[PFX]]int64_t3x1_Val = [[STR]] [3 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t3x2_Val = [[STR]] [6 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t3x3_Val = [[STR]] [9 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t3x4_Val = [[STR]] [12 x i64][[ZI]], align 8 +TYPE_DECL( int64_t3x1 ); +TYPE_DECL( int64_t3x2 ); +TYPE_DECL( int64_t3x3 ); +TYPE_DECL( int64_t3x4 ); + +//CHECK: [[PFX]]int64_t4x1_Val = [[STR]] [4 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t4x2_Val = [[STR]] [8 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t4x3_Val = [[STR]] [12 x i64][[ZI]], align 8 +//CHECK: [[PFX]]int64_t4x4_Val = [[STR]] [16 x i64][[ZI]], align 8 +TYPE_DECL( int64_t4x1 ); +TYPE_DECL( int64_t4x2 ); +TYPE_DECL( int64_t4x3 ); +TYPE_DECL( int64_t4x4 ); + +//CHECK: [[PFX]]uint64_t1x1_Val = [[STR]] [1 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t1x2_Val = [[STR]] [2 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t1x3_Val = [[STR]] [3 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t1x4_Val = [[STR]] [4 x i64][[ZI]], align 8 +TYPE_DECL( uint64_t1x1 ); +TYPE_DECL( uint64_t1x2 ); +TYPE_DECL( uint64_t1x3 ); +TYPE_DECL( uint64_t1x4 ); + +//CHECK: [[PFX]]uint64_t2x1_Val = [[STR]] [2 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t2x2_Val = [[STR]] [4 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t2x3_Val = [[STR]] [6 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t2x4_Val = [[STR]] [8 x 
i64][[ZI]], align 8 +TYPE_DECL( uint64_t2x1 ); +TYPE_DECL( uint64_t2x2 ); +TYPE_DECL( uint64_t2x3 ); +TYPE_DECL( uint64_t2x4 ); + +//CHECK: [[PFX]]uint64_t3x1_Val = [[STR]] [3 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t3x2_Val = [[STR]] [6 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t3x3_Val = [[STR]] [9 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t3x4_Val = [[STR]] [12 x i64][[ZI]], align 8 +TYPE_DECL( uint64_t3x1 ); +TYPE_DECL( uint64_t3x2 ); +TYPE_DECL( uint64_t3x3 ); +TYPE_DECL( uint64_t3x4 ); + +//CHECK: [[PFX]]uint64_t4x1_Val = [[STR]] [4 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t4x2_Val = [[STR]] [8 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t4x3_Val = [[STR]] [12 x i64][[ZI]], align 8 +//CHECK: [[PFX]]uint64_t4x4_Val = [[STR]] [16 x i64][[ZI]], align 8 +TYPE_DECL( uint64_t4x1 ); +TYPE_DECL( uint64_t4x2 ); +TYPE_DECL( uint64_t4x3 ); +TYPE_DECL( uint64_t4x4 ); + + +//CHECK: [[PFX]]half1x1_Val = [[STR]] [1 x half][[ZI]], align 2 +//CHECK: [[PFX]]half1x2_Val = [[STR]] [2 x half][[ZI]], align 2 +//CHECK: [[PFX]]half1x3_Val = [[STR]] [3 x half][[ZI]], align 2 +//CHECK: [[PFX]]half1x4_Val = [[STR]] [4 x half][[ZI]], align 2 +TYPE_DECL( half1x1 ); +TYPE_DECL( half1x2 ); +TYPE_DECL( half1x3 ); +TYPE_DECL( half1x4 ); + +//CHECK: [[PFX]]half2x1_Val = [[STR]] [2 x half][[ZI]], align 2 +//CHECK: [[PFX]]half2x2_Val = [[STR]] [4 x half][[ZI]], align 2 +//CHECK: [[PFX]]half2x3_Val = [[STR]] [6 x half][[ZI]], align 2 +//CHECK: [[PFX]]half2x4_Val = [[STR]] [8 x half][[ZI]], align 2 +TYPE_DECL( half2x1 ); +TYPE_DECL( half2x2 ); +TYPE_DECL( half2x3 ); +TYPE_DECL( half2x4 ); + +//CHECK: [[PFX]]half3x1_Val = [[STR]] [3 x half][[ZI]], align 2 +//CHECK: [[PFX]]half3x2_Val = [[STR]] [6 x half][[ZI]], align 2 +//CHECK: [[PFX]]half3x3_Val = [[STR]] [9 x half][[ZI]], align 2 +//CHECK: [[PFX]]half3x4_Val = [[STR]] [12 x half][[ZI]], align 2 +TYPE_DECL( half3x1 ); +TYPE_DECL( half3x2 ); +TYPE_DECL( half3x3 ); +TYPE_DECL( half3x4 ); + +//CHECK: [[PFX]]half4x1_Val = 
[[STR]] [4 x half][[ZI]], align 2 +//CHECK: [[PFX]]half4x2_Val = [[STR]] [8 x half][[ZI]], align 2 +//CHECK: [[PFX]]half4x3_Val = [[STR]] [12 x half][[ZI]], align 2 +//CHECK: [[PFX]]half4x4_Val = [[STR]] [16 x half][[ZI]], align 2 +TYPE_DECL( half4x1 ); +TYPE_DECL( half4x2 ); +TYPE_DECL( half4x3 ); +TYPE_DECL( half4x4 ); + +//CHECK: [[PFX]]float1x1_Val = [[STR]] [1 x float][[ZI]], align 4 +//CHECK: [[PFX]]float1x2_Val = [[STR]] [2 x float][[ZI]], align 4 +//CHECK: [[PFX]]float1x3_Val = [[STR]] [3 x float][[ZI]], align 4 +//CHECK: [[PFX]]float1x4_Val = [[STR]] [4 x float][[ZI]], align 4 +TYPE_DECL( float1x1 ); +TYPE_DECL( float1x2 ); +TYPE_DECL( float1x3 ); +TYPE_DECL( float1x4 ); + +//CHECK: [[PFX]]float2x1_Val = [[STR]] [2 x float][[ZI]], align 4 +//CHECK: [[PFX]]float2x2_Val = [[STR]] [4 x float][[ZI]], align 4 +//CHECK: [[PFX]]float2x3_Val = [[STR]] [6 x float][[ZI]], align 4 +//CHECK: [[PFX]]float2x4_Val = [[STR]] [8 x float][[ZI]], align 4 +TYPE_DECL( float2x1 ); +TYPE_DECL( float2x2 ); +TYPE_DECL( float2x3 ); +TYPE_DECL( float2x4 ); + +//CHECK: [[PFX]]float3x1_Val = [[STR]] [3 x float][[ZI]], align 4 +//CHECK: [[PFX]]float3x2_Val = [[STR]] [6 x float][[ZI]], align 4 +//CHECK: [[PFX]]float3x3_Val = [[STR]] [9 x float][[ZI]], align 4 +//CHECK: [[PFX]]float3x4_Val = [[STR]] [12 x float][[ZI]], align 4 +TYPE_DECL( float3x1 ); +TYPE_DECL( float3x2 ); +TYPE_DECL( float3x3 ); +TYPE_DECL( float3x4 ); + +//CHECK: [[PFX]]float4x1_Val = [[STR]] [4 x float][[ZI]], align 4 +//CHECK: [[PFX]]float4x2_Val = [[STR]] [8 x float][[ZI]], align 4 +//CHECK: [[PFX]]float4x3_Val = [[STR]] [12 x float][[ZI]], align 4 +//CHECK: [[PFX]]float4x4_Val = [[STR]] [16 x float][[ZI]], align 4 +TYPE_DECL( float4x1 ); +TYPE_DECL( float4x2 ); +TYPE_DECL( float4x3 ); +TYPE_DECL( float4x4 ); + +//CHECK: [[PFX]]double1x1_Val = [[STR]] [1 x double][[ZI]], align 8 +//CHECK: [[PFX]]double1x2_Val = [[STR]] [2 x double][[ZI]], align 8 +//CHECK: [[PFX]]double1x3_Val = [[STR]] [3 x double][[ZI]], align 8 
+//CHECK: [[PFX]]double1x4_Val = [[STR]] [4 x double][[ZI]], align 8 +TYPE_DECL( double1x1 ); +TYPE_DECL( double1x2 ); +TYPE_DECL( double1x3 ); +TYPE_DECL( double1x4 ); + +//CHECK: [[PFX]]double2x1_Val = [[STR]] [2 x double][[ZI]], align 8 +//CHECK: [[PFX]]double2x2_Val = [[STR]] [4 x double][[ZI]], align 8 +//CHECK: [[PFX]]double2x3_Val = [[STR]] [6 x double][[ZI]], align 8 +//CHECK: [[PFX]]double2x4_Val = [[STR]] [8 x double][[ZI]], align 8 +TYPE_DECL( double2x1 ); +TYPE_DECL( double2x2 ); +TYPE_DECL( double2x3 ); +TYPE_DECL( double2x4 ); + +//CHECK: [[PFX]]double3x1_Val = [[STR]] [3 x double][[ZI]], align 8 +//CHECK: [[PFX]]double3x2_Val = [[STR]] [6 x double][[ZI]], align 8 +//CHECK: [[PFX]]double3x3_Val = [[STR]] [9 x double][[ZI]], align 8 +//CHECK: [[PFX]]double3x4_Val = [[STR]] [12 x double][[ZI]], align 8 +TYPE_DECL( double3x1 ); +TYPE_DECL( double3x2 ); +TYPE_DECL( double3x3 ); +TYPE_DECL( double3x4 ); + +//CHECK: [[PFX]]double4x1_Val = [[STR]] [4 x double][[ZI]], align 8 +//CHECK: [[PFX]]double4x2_Val = [[STR]] [8 x double][[ZI]], align 8 +//CHECK: [[PFX]]double4x3_Val = [[STR]] [12 x double][[ZI]], align 8 +//CHECK: [[PFX]]double4x4_Val = [[STR]] [16 x double][[ZI]], align 8 +TYPE_DECL( double4x1 ); +TYPE_DECL( double4x2 ); +TYPE_DECL( double4x3 ); +TYPE_DECL( double4x4 ); + +#ifndef SPIRV +} +#endif diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl new file mode 100644 index 0000000000000..03045e978c268 --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl @@ -0,0 +1,138 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-library -fnative-half-type -finclude-default-header -fsyntax-only %s -verify + +typedef struct test_struct { // expected-note 1+ {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}} + // expected-note-re@-1 1+ {{candidate constructor (the implicit move constructor) not 
viable: no known conversion from '{{[^']*}}' (aka '{{[^']*}}') to 'test_struct' for 1st argument}} + // expected-note-re@-2 1+ {{candidate constructor (the implicit copy constructor) not viable: no known conversion from '{{[^']*}}' (aka '{{[^']*}}') to 'const test_struct' for 1st argument}} +} test_struct; + +void f1(void) { + uint16_t3x3 u16_3x3; + int3x3 i32_3x3; + int16_t3x3 i16_3x3; + int4x4 i32_4x4; + float4x4 f32_4x4; + int i; + float4 v; + test_struct s; + + i32_3x3 = (int3x3)u16_3x3; + i16_3x3 = (int16_t3x3)i32_3x3; + i32_4x4 = (int4x4)i16_3x3; // expected-error {{conversion between matrix types 'int4x4' (aka 'matrix') and 'matrix' of different size is not allowed}} + f32_4x4 = (int4x4)i32_4x4; // expected-error {{assigning to 'matrix' from incompatible type 'matrix'}} + i = (int)i32_4x4; // expected-error {{C-style cast from 'int4x4' (aka 'matrix') to 'int' is not allowed}} + i32_4x4 = (int4x4)i; // expected-error {{C-style cast from 'int' to 'int4x4' (aka 'matrix') is not allowed}} + v = (float4)i32_4x4; // expected-error {{C-style cast from 'int4x4' (aka 'matrix') to 'float4' (aka 'vector') is not allowed}} + i32_4x4 = (int4x4)v; // expected-error {{C-style cast from 'float4' (aka 'vector') to 'int4x4' (aka 'matrix') is not allowed}} + s = (test_struct)i16_3x3; // expected-error {{no matching conversion for C-style cast from 'int16_t3x3' (aka 'matrix') to 'test_struct'}} + i16_3x3 = (int16_t3x3)s; // expected-error {{cannot convert 'test_struct' to 'int16_t3x3' (aka 'matrix') without a conversion operator}} + + i32_4x4 = (int4x4)f32_4x4; +} + +void f2(void) { + float2x2 f32_2x2; + double3x3 f64_3x3; + double2x2 f64_2x2; + int4x4 i32_4x4; + uint4x4 u32_4x4; + uint3x3 u32_3x3; + float f; + + f64_3x3 = (double3x3)f32_2x2; // expected-error {{conversion between matrix types 'double3x3' (aka 'matrix') and 'matrix' of different size is not allowed}} + f64_2x2 = (double2x2)f32_2x2; + + u32_4x4 = (uint4x4)i32_4x4; + i32_4x4 = (int4x4)u32_4x4; + u32_3x3 = 
(uint3x3)i32_4x4; // expected-error {{conversion between matrix types 'uint3x3' (aka 'matrix') and 'matrix' of different size is not allowed}} + f = (float)i32_4x4; // expected-error {{C-style cast from 'int4x4' (aka 'matrix') to 'float' is not allowed}} + i32_4x4 = (int4x4)f; // expected-error {{C-style cast from 'float' to 'int4x4' (aka 'matrix') is not allowed}} +} + +template +using matrix_3_3 = matrix; + +template +using matrix_4_4 = matrix; + +void f3() { + matrix_3_3 u16_3x3; + matrix_3_3 i32_3x3; + matrix_3_3 i16_3x3; + matrix_4_4 i32_4x4; + matrix_4_4 f32_4x4; + int i; + int4 v; + test_struct s; + + i32_3x3 = (matrix_3_3)u16_3x3; + i32_3x3 = u16_3x3; // expected-error {{assigning to 'matrix_3_3' from incompatible type 'matrix_3_3'}} + i16_3x3 = (matrix_3_3)i32_3x3; + i32_4x4 = (matrix_4_4)i16_3x3; // expected-error {{conversion between matrix types 'matrix_4_4' (aka 'matrix') and 'matrix' of different size is not allowed}} + + i = (int)i16_3x3; // expected-error {{C-style cast from 'matrix_3_3' (aka 'matrix') to 'int' is not allowed}} + i32_3x3 = (matrix_3_3)i; // expected-error {{C-style cast from 'int' to 'matrix_3_3' (aka 'matrix') is not allowed}} + + v = (int4)i32_3x3; // expected-error {{C-style cast from 'matrix_3_3' (aka 'matrix') to 'int4' (aka 'vector') is not allowed}} + u16_3x3 = (matrix_3_3)v; // expected-error {{C-style cast from 'int4' (aka 'vector') to 'matrix_3_3' (aka 'matrix') is not allowed}} + s = (test_struct)u16_3x3; // expected-error {{no matching conversion for C-style cast from 'matrix_3_3' (aka 'matrix') to 'test_struct'}} + f32_4x4 = (matrix_4_4)s; // expected-error {{cannot convert 'test_struct' to 'matrix_4_4' (aka 'matrix') without a conversion operator}} +} + +void f4() { + matrix_3_3 u16_3x3; + matrix_3_3 i32_3x3; + matrix_3_3 i16_3x3; + matrix_4_4 i32_4x4; + matrix_4_4 f32_4x4; + int i; + int4 v; + test_struct s; + + i32_3x3 = static_cast>(u16_3x3); + i16_3x3 = static_cast>(i32_3x3); + i32_4x4 = static_cast>(i16_3x3); // 
expected-error {{conversion between matrix types 'matrix_4_4' (aka 'matrix') and 'matrix' of different size is not allowed}} + + i = static_cast(i16_3x3); // expected-error {{static_cast from 'matrix_3_3' (aka 'matrix') to 'int' is not allowed}} + i32_3x3 = static_cast>(i); // expected-error {{static_cast from 'int' to 'matrix_3_3' (aka 'matrix') is not allowed}} + + v = static_cast(i32_3x3); // expected-error {{static_cast from 'matrix_3_3' (aka 'matrix') to 'int4' (aka 'vector') is not allowed}} + i16_3x3 = static_cast>(v); // expected-error {{static_cast from 'int4' (aka 'vector') to 'matrix_3_3' (aka 'matrix') is not allowed}} + + s = static_cast(u16_3x3); // expected-error {{no matching conversion for static_cast from 'matrix_3_3' (aka 'matrix') to 'test_struct'}} + f32_4x4 = static_cast>(s); // expected-error {{cannot convert 'test_struct' to 'matrix_4_4' (aka 'matrix') without a conversion operator}} +} + +void f5() { + matrix_3_3 f32_3x3; + matrix_3_3 f64_3x3; + matrix_4_4 f64_4x4; + matrix_4_4 i32_4x4; + matrix_3_3 u32_3x3; + matrix_4_4 u32_4x4; + float f; + + f64_3x3 = (matrix_3_3)f32_3x3; + f64_4x4 = (matrix_4_4)f32_3x3; // expected-error {{conversion between matrix types 'matrix_4_4' (aka 'matrix') and 'matrix' of different size is not allowed}} + i32_4x4 = (matrix_4_4)f64_4x4; + u32_3x3 = (matrix_4_4)i32_4x4; // expected-error {{assigning to 'matrix<[...], 3, 3>' from incompatible type 'matrix<[...], 4, 4>'}} + u32_4x4 = (matrix_4_4)i32_4x4; + i32_4x4 = (matrix_4_4)u32_4x4; +} + +void f6() { + matrix_3_3 f32_3x3; + matrix_3_3 f64_3x3; + matrix_4_4 f64_4x4; + matrix_4_4 i32_4x4; + matrix_3_3 u32_3x3; + matrix_4_4 u32_4x4; + float f; + + f64_3x3 = static_cast>(f32_3x3); + f64_4x4 = static_cast>(f32_3x3); // expected-error {{conversion between matrix types 'matrix_4_4' (aka 'matrix') and 'matrix' of different size is not allowed}} + + i32_4x4 = static_cast>(f64_4x4); + u32_3x3 = static_cast>(i32_4x4); // expected-error {{assigning to 'matrix<[...], 3, 3>' 
from incompatible type 'matrix<[...], 4, 4>'}} + u32_4x4 = static_cast>(i32_4x4); + i32_4x4 = static_cast>(u32_4x4); +} diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl new file mode 100644 index 0000000000000..29640ae01d6fb --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify + +double indexi32(matrix X, int i) { return X[i][0]; } + +double indexu32(matrix X, uint i) { return X[i][0]; } + +double indexi16(matrix X, int16_t i) { return X[i][0]; } + +double indexu16(matrix X, uint16_t i) { return X[i][0]; } + +double indexi64(matrix X, int64_t i) { return X[i][0]; } + +double indexu64(matrix X, uint64_t i) { return X[i][0]; } + +double indexi32c(matrix X, int i) { return X[0][i]; } + +double indexu32c(matrix X, uint i) { return X[0][i]; } + +double indexi16c(matrix X, int16_t i) { return X[0][i]; } + +double indexu16c(matrix X, uint16_t i) { return X[0][i]; } + +double indexi64c(matrix X, int64_t i) { return X[0][i]; } + +double indexu64c(matrix X, uint64_t i) { return X[0][i]; } + +// expected-no-diagnostics diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl new file mode 100644 index 0000000000000..4423e7cde4bd5 --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify + +void transpose(float3x4 a, int3x2 b, double3x3 c, int e) { + + a = __builtin_matrix_transpose(b); + // expected-error@-1 {{assigning to 'float3x4' (aka 'matrix') from incompatible type 'matrix'}} + b = __builtin_matrix_transpose(b); + // expected-error@-1 
{{assigning to 'int3x2' (aka 'matrix') from incompatible type 'matrix'}} + __builtin_matrix_transpose(e); + // expected-error@-1 {{1st argument must be a matrix}} + __builtin_matrix_transpose("test"); + // expected-error@-1 {{1st argument must be a matrix}} + + uint3x3 m = __builtin_matrix_transpose(c); + // expected-error@-1 {{cannot initialize a variable of type 'uint3x3' (aka 'matrix') with an rvalue of type 'matrix'}} +} + +template +struct MyMatrix { + using matrix_t = matrix; + + matrix_t value; +}; + +template +typename MyMatrix::matrix_t transpose(inout MyMatrix A) { + uint16_t v1 = __builtin_matrix_transpose(A.value); + // expected-error@-1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix'}} + // expected-error@-2 2 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix'}} + + __builtin_matrix_transpose(A); + // expected-error@-1 3 {{1st argument must be a matrix}} + + return __builtin_matrix_transpose(A.value); + // expected-error@-1 {{cannot initialize return object of type 'typename MyMatrix::matrix_t' (aka 'matrix') with an rvalue of type 'matrix'}} + // expected-error@-2 {{cannot initialize return object of type 'typename MyMatrix::matrix_t' (aka 'matrix') with an rvalue of type 'matrix'}} + // expected-error@-3 {{cannot initialize return object of type 'typename MyMatrix::matrix_t' (aka 'matrix') with an rvalue of type 'matrix'}} +} + +void test_transpose_template() { + MyMatrix Mat1; + MyMatrix Mat2; + Mat1.value = transpose(Mat1); + // expected-note@-1 {{in instantiation of function template specialization 'transpose' requested here}} + + Mat1.value = transpose(Mat2); + // expected-note@-1 {{in instantiation of function template specialization 'transpose' requested here}} + + MyMatrix Mat3; + Mat3.value = transpose(Mat2); + // expected-note@-1 {{in instantiation of function template specialization 'transpose' requested here}} +} + diff --git 
a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl new file mode 100644 index 0000000000000..fd62c300857fe --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl @@ -0,0 +1,307 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify + +void add(float4x4 a, float3x4 b, float4x3 c) { + a = b + c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'float4x3' (aka 'matrix'))}} + + b += c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'float4x3' (aka 'matrix'))}} + + a = b + b; // expected-error {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}} + + a = 10 + b; + // expected-error@-1 {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}} +} + +void sub(float4x4 a, float3x4 b, float4x3 c) { + a = b - c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'float4x3' (aka 'matrix'))}} + + b -= c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'float4x3' (aka 'matrix'))}} + + a = b - b; // expected-error {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}} + + a = 10 - b; + // expected-error@-1 {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}} + +} + +void matrix_matrix_multiply(float4x4 a, float3x4 b, int4x3 c, int4x4 d, float sf, inout uint16_t p) { + // Check dimension mismatches. 
+ a = a * b; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float3x4' (aka 'matrix'))}} + a *= b; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float3x4' (aka 'matrix'))}} + b = a * a; + // expected-error@-1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}} + + // Check element type mismatches. + a = b * c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'int4x3' (aka 'matrix'))}} + b *= c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'int4x3' (aka 'matrix'))}} + d = a * a; + // expected-error@-1 {{assigning to 'matrix' from incompatible type 'matrix'}} + + p = a * a; + // expected-error@-1 {{assigning to 'uint16_t' (aka 'unsigned short') from incompatible type 'float4x4' (aka 'matrix')}} +} + +void mat_scalar_multiply(float4x4 a, float3x4 b, float sf, inout uint16_t p) { + // Shape of multiplication result does not match the type of b. + b = a * sf; + // expected-error@-1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}} + b = sf * a; + // expected-error@-1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}} + + sf = a * sf; + // expected-error@-1 {{assigning to 'float' from incompatible type 'float4x4' (aka 'matrix')}} +} + +void mat_scalar_divide(float4x4 a, float3x4 b, float sf, inout uint16_t p) { + // Shape of multiplication result does not match the type of b. 
+ b = a / sf; + // expected-error@-1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}} + b = sf / a; + // expected-error@-1 {{invalid operands to binary expression ('float' and 'float4x4' (aka 'matrix'))}} + + a = p / a; + // expected-error@-1 {{invalid operands to binary expression ('uint16_t' (aka 'unsigned short') and 'float4x4' (aka 'matrix'))}} + + sf = a / sf; + // expected-error@-1 {{assigning to 'float' from incompatible type 'float4x4' (aka 'matrix')}} +} + +void matrix_matrix_divide(float4x4 a, float3x4 b, int4x3 c, int4x4 d, float sf, uint16_t p) { + // Matrix by matrix division is not supported. + a = a / a; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float4x4')}} + + b = a / a; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float4x4')}} + + // Check element type mismatches. + a = b / c; + // expected-error@-1 {{invalid operands to binary expression ('float3x4' (aka 'matrix') and 'int4x3' (aka 'matrix'))}} + d = a / a; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float4x4')}} + + p = a / a; + // expected-error@-1 {{invalid operands to binary expression ('float4x4' (aka 'matrix') and 'float4x4')}} +} + +float3x4 get_matrix(void); + +void insert(float3x4 a, float f) { + // Non integer indexes. + a[1][f] = 0; + // expected-error@-1 {{matrix column index is not an integer}} + a[f][2] = 0; + // expected-error@-1 {{matrix row index is not an integer}} + a[f][f] = 0; + // expected-error@-1 {{matrix row index is not an integer}} + // expected-error@-2 {{matrix column index is not an integer}} + a[0][f] = 0; + // expected-error@-1 {{matrix column index is not an integer}} + + a[f][f] = 0; + // expected-error@-1 {{matrix row index is not an integer}} + // expected-error@-2 {{matrix column index is not an integer}} + + // Indexes outside allowed dimensions. 
+ a[-1][3] = 10.0; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + a[2][-1] = 10.0; + // expected-error@-1 {{matrix column index is outside the allowed range [0, 4)}} + a[2][-1u] = 10.0; + // expected-error@-1 {{matrix column index is outside the allowed range [0, 4)}} + a[-1u][3] = 10.0; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + a[5][2] = 10.0; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + a[2][10] = 10.0; + // expected-error@-1 {{matrix column index is outside the allowed range [0, 4)}} + a[3][2.0] = f; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + // expected-error@-2 {{matrix column index is not an integer}} + (a[1])[1] = f; + // expected-error@-1 {{matrix row and column subscripts cannot be separated by any expression}} + + get_matrix()[0][0] = f; + // expected-error@-1 {{expression is not assignable}} + get_matrix()[3][1.0] = f; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + // expected-error@-2 {{matrix column index is not an integer}} + + (get_matrix()[0])[2] = f; + // expected-error@-1 {{matrix row and column subscripts cannot be separated by any expression}} + + a[4, 5] = 5.0; + // expected-error@-1 {{comma expressions are not allowed as indices in matrix subscript expressions}} + // expected-warning@-2 {{left operand of comma operator has no effect}} + + a[4, 5, 4] = 5.0; + // expected-error@-1 {{comma expressions are not allowed as indices in matrix subscript expressions}} + // expected-warning@-2 {{left operand of comma operator has no effect}} + // expected-warning@-3 {{left operand of comma operator has no effect}} +} + +void extract(float3x4 a, float f) { + // Non integer indexes. 
+ float v1 = a[2][f]; + // expected-error@-1 {{matrix column index is not an integer}} + float v2 = a[f][3]; + // expected-error@-1 {{matrix row index is not an integer}} + float v3 = a[f][f]; + // expected-error@-1 {{matrix row index is not an integer}} + // expected-error@-2 {{matrix column index is not an integer}} + + // Indexes outside allowed dimensions. + float v5 = a[-1][3]; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + float v6 = a[2][-1]; + // expected-error@-1 {{matrix column index is outside the allowed range [0, 4)}} + float v8 = a[-1u][3]; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + float v9 = a[5][2]; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + float v10 = a[2][4]; + // expected-error@-1 {{matrix column index is outside the allowed range [0, 4)}} + float v11 = a[3][2.0]; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + // expected-error@-2 {{matrix column index is not an integer}} + + float v12 = get_matrix()[0][0]; + float v13 = get_matrix()[3][2.0]; + // expected-error@-1 {{matrix row index is outside the allowed range [0, 3)}} + // expected-error@-2 {{matrix column index is not an integer}} + +} + +template +struct MyMatrix { + using matrix_t = matrix; + + matrix_t value; +}; + +template +typename MyMatrix::matrix_t add(inout MyMatrix A, inout MyMatrix B) { + uint16_t v1 = A.value + B.value; + // expected-error@-1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix')}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + // expected-error@-3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + + return A.value + B.value; + // expected-error@-1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 
'matrix_t' (aka 'matrix'))}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} +} + +void test_add_template() { + MyMatrix Mat1; + MyMatrix Mat2; + MyMatrix Mat3; + unsigned v1 = add(Mat1, Mat1); + // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix::matrix_t' (aka 'matrix')}} + // expected-note@-2 {{in instantiation of function template specialization 'add' requested here}} + + Mat1.value = add(Mat1, Mat2); + // expected-note@-1 {{in instantiation of function template specialization 'add' requested here}} + + Mat1.value = add(Mat2, Mat3); + // expected-note@-1 {{in instantiation of function template specialization 'add' requested here}} +} + +template +typename MyMatrix::matrix_t subtract(inout MyMatrix A, inout MyMatrix B) { + uint16_t v1 = A.value - B.value; + // expected-error@-1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix')}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix')}} + // expected-error@-3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix')}} + + return A.value - B.value; + // expected-error@-1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix')}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix')}} +} + +void test_subtract_template() { + MyMatrix Mat1; + MyMatrix Mat2; + MyMatrix Mat3; + unsigned v1 = subtract(Mat1, Mat1); + // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix::matrix_t' (aka 'matrix')}} + // expected-note@-2 {{in instantiation of function template specialization 'subtract' requested here}} + + Mat1.value = subtract(Mat1, Mat2); + // 
expected-note@-1 {{in instantiation of function template specialization 'subtract' requested here}} + + Mat1.value = subtract(Mat2, Mat3); + // expected-note@-1 {{in instantiation of function template specialization 'subtract' requested here}} +} + +template +typename MyMatrix::matrix_t multiply(inout MyMatrix A, inout MyMatrix B) { + uint16_t v1 = A.value * B.value; + // expected-error@-1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix')}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + // expected-error@-3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + + MyMatrix m; + B.value = m.value * A.value; + // expected-error@-1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + // expected-error@-3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + + return A.value * B.value; + // expected-error@-1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'matrix_t' (aka 'matrix'))}} +} + +void test_multiply_template() { + MyMatrix Mat1; + MyMatrix Mat2; + MyMatrix Mat3; + unsigned v1 = multiply(Mat1, Mat1); + // expected-note@-1 {{in instantiation of function template specialization 'multiply' requested here}} + // expected-error@-2 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix::matrix_t' (aka 'matrix')}} + + MyMatrix Mat4; + Mat1.value = multiply(Mat4, Mat2); + // expected-note@-1 {{in instantiation of function template specialization 'multiply' requested here}} + + 
Mat1.value = multiply(Mat3, Mat1); + // expected-note@-1 {{in instantiation of function template specialization 'multiply' requested here}} + + Mat4.value = Mat4.value * Mat1; + // expected-error@-1 {{no viable conversion from 'MyMatrix' to 'unsigned int'}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'MyMatrix')}} +} + +struct UserT {}; + +struct StructWithC { + operator UserT() { + // expected-note@-1 4 {{candidate function}} + return {}; + } +}; + +void test_DoubleWrapper(inout MyMatrix m, inout StructWithC c) { + m.value = m.value + c; + // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'StructWithC')}} + + m.value = c + m.value; + // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}} + // expected-error@-2 {{invalid operands to binary expression ('StructWithC' and 'matrix_t' (aka 'matrix'))}} + + m.value = m.value - c; + // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}} + // expected-error@-2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix') and 'StructWithC')}} + + m.value = c - m.value; + // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}} + // expected-error@-2 {{invalid operands to binary expression ('StructWithC' and 'matrix_t' (aka 'matrix'))}} +} + diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl new file mode 100644 index 0000000000000..fe374f388d104 --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl @@ -0,0 +1,48 @@ +// A note points to the external source at present, so we have to ignore it. 
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify -verify-ignore-unexpected=note +// All the errors are actually in the external source at present, so we have to ignore them. +// The notes point to the proper lines though. +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only -DMTXTYPE %s -verify=mtxtype -verify-ignore-unexpected=error + +#ifndef MTXTYPE +void matrix_var_dimensions(int Rows, unsigned Columns, uint16_t C) { + // expected-note@-1 3{{declared here}} + matrix m1; // expected-error{{non-type template argument is not a constant expression}} + // expected-note@-1{{function parameter 'Rows' with unknown value cannot be used in a constant expression}} + matrix m2; // expected-error{{non-type template argument is not a constant expression}} + // expected-note@-1{{function parameter 'Columns' with unknown value cannot be used in a constant expression}} + matrix m3; // expected-error{{non-type template argument is not a constant expression}} + // expected-note@-1{{function parameter 'C' with unknown value cannot be used in a constant expression}} + matrix m8; // expected-error{{template argument for non-type template parameter must be an expression}} + +} +#else +struct S1 {}; + +enum TestEnum { + A, + B +}; + +void matrix_unsupported_element_type() { + // The future-errors are not checked yet since they are predeclared and are ignored. 
+ matrix m1; // future-error{{invalid matrix element type 'S1'}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m2; // future-error{{invalid matrix element type 'bool'}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m3; // future-error{{invalid matrix element type 'TestEnum'}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + + matrix m4; // future-error{{matrix row size too large}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m5; // future-error{{matrix column size too large}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m6; // future-error{{zero matrix size}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m7; // future-error{{zero matrix size}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + matrix m9; // future-error{{matrix row size too large}} + // mtxtype-note@-1{{in instantiation of template type alias 'matrix' requested here}} + +} +#endif