From b93f6a46fd78fe5557f7a5263d8f4885ea3930fd Mon Sep 17 00:00:00 2001
From: alecardoso-tech <alecardoso@microsoft.com>
Date: Fri, 18 Apr 2025 14:59:52 -0700
Subject: [PATCH 01/93] Moved HANDLE management to Test Class Constructor and
 Destructor (#7357)

Test class `ShaderOpTest` had a `HANDLE` not closed after `RunShaderOp`
test ran.

Created a constructor and destructor for the Test Class to manage the
`HANDLE`.
---
 tools/clang/unittests/HLSLExec/ShaderOpTest.cpp | 13 +++++++++----
 tools/clang/unittests/HLSLExec/ShaderOpTest.h   |  3 +++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
index e6c9b10f6c..8dde3faa0b 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
@@ -258,6 +258,15 @@ void CommandListRefs::CreateForDevice(ID3D12Device *pDevice, bool compute) {
                                       IID_PPV_ARGS(&List)));
 }
 
+ShaderOpTest::ShaderOpTest() {
+  m_hFence = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+  if (m_hFence == nullptr) {
+    AtlThrow(HRESULT_FROM_WIN32(GetLastError()));
+  }
+}
+
+ShaderOpTest::~ShaderOpTest() { CloseHandle(m_hFence); }
+
 void ShaderOpTest::CopyBackResources() {
   CommandListRefs ResCommandList;
   ResCommandList.CreateForDevice(m_pDevice, m_pShaderOp->IsCompute());
@@ -423,10 +432,6 @@ void ShaderOpTest::CreateDevice() {
   CHECK_HR(m_pDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE,
                                   __uuidof(ID3D12Fence), (void **)&m_pFence));
   m_pFence->SetName(L"ShaderOpTest Fence");
-  m_hFence = CreateEvent(nullptr, FALSE, FALSE, nullptr);
-  if (m_hFence == nullptr) {
-    AtlThrow(HRESULT_FROM_WIN32(GetLastError()));
-  }
 }
 
 static void InitByteCode(D3D12_SHADER_BYTECODE *pBytecode, ID3D10Blob *pBlob) {
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.h b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
index e65bd9e4e5..b71ee08765 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.h
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
@@ -275,6 +275,9 @@ class ShaderOpTest {
   typedef std::function<void(LPCSTR Name, LPCSTR pText, IDxcBlob **ppShaderBlob,
                              ShaderOp *pShaderOp)>
       TShaderCallbackFn;
+
+  ShaderOpTest();
+  ~ShaderOpTest();
   void GetPipelineStats(D3D12_QUERY_DATA_PIPELINE_STATISTICS *pStats);
   void GetReadBackData(LPCSTR pResourceName, MappedData *pData);
   void RunShaderOp(ShaderOp *pShaderOp);

From b4a3076caa92c4e9ed05761cbcd2141591fb3f89 Mon Sep 17 00:00:00 2001
From: Chris B <cbieneman@microsoft.com>
Date: Mon, 21 Apr 2025 12:23:02 -0500
Subject: [PATCH 02/93] Revert ADO pipelines to Ubuntu 22.04 temporarily
 (#7365)

DXC seems to be building incorrectly with GCC-13 and later, which is
causing our pre-merge testing on 24.04 to fail. This will take some time
to sort out, so in the meantime I'm reverting to 22.04 on our pipelines.
---
 azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 33c5349f9e..7967fa03e3 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -52,7 +52,7 @@ stages:
 
     variables:
       macOS: macOS-latest
-      linux: Ubuntu-latest
+      linux: Ubuntu-22.04 # FIXME: #7364, DXC does not build correctly with GCC 13+
 
     strategy:
       matrix:

From 8988e473465454f7a6dbc55223c7655c1b5af973 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Tue, 22 Apr 2025 18:31:31 +0200
Subject: [PATCH 03/93] [SER] Diagnose payload in HitObject::TraceRay|Invoke
 (#7356)

- Generalize raypayload validation to HitObject::TraceRay|Invoke
- Reject non-numeric payload types in [HitObject::]TraceRay|Invoke

Specification:
https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md

Bug: #7234 [SER] Diagnose and validate illegal use of HitObject in
unsupported contexts
---
 tools/clang/lib/Sema/SemaDXR.cpp              | 141 +++++++++++-------
 .../hitobject_traceinvoke_payload.hlsl        |  27 ++++
 .../hitobject_traceinvoke_payload_udt.hlsl    |  22 +++
 3 files changed, 133 insertions(+), 57 deletions(-)
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl

diff --git a/tools/clang/lib/Sema/SemaDXR.cpp b/tools/clang/lib/Sema/SemaDXR.cpp
index 36ab55ea10..e5b2140cca 100644
--- a/tools/clang/lib/Sema/SemaDXR.cpp
+++ b/tools/clang/lib/Sema/SemaDXR.cpp
@@ -28,6 +28,7 @@
 
 #include "dxc/DXIL/DxilConstants.h"
 #include "dxc/DXIL/DxilShaderModel.h"
+#include "dxc/HlslIntrinsicOp.h"
 
 using namespace clang;
 using namespace sema;
@@ -49,9 +50,9 @@ struct PayloadUse {
   const MemberExpr *Member = nullptr;
 };
 
-struct TraceRayCall {
-  TraceRayCall() = default;
-  TraceRayCall(const CallExpr *Call, const CFGBlock *Parent)
+struct PayloadBuiltinCall {
+  PayloadBuiltinCall() = default;
+  PayloadBuiltinCall(const CallExpr *Call, const CFGBlock *Parent)
       : Call(Call), Parent(Parent) {}
   const CallExpr *Call = nullptr;
   const CFGBlock *Parent = nullptr;
@@ -71,7 +72,7 @@ struct DxrShaderDiagnoseInfo {
   const FunctionDecl *funcDecl;
   const VarDecl *Payload;
   DXIL::PayloadAccessShaderStage Stage;
-  std::vector<TraceRayCall> TraceCalls;
+  std::vector<PayloadBuiltinCall> PayloadBuiltinCalls;
   std::map<const FieldDecl *, std::vector<PayloadUse>> WritesPerField;
   std::map<const FieldDecl *, std::vector<PayloadUse>> ReadsPerField;
   std::vector<PayloadUse> PayloadAsCallArg;
@@ -121,24 +122,42 @@ GetPayloadQualifierForStage(FieldDecl *Field,
   return DXIL::PayloadAccessQualifier::NoAccess;
 }
 
-// Returns the declaration of the payload used in a TraceRay call
-const VarDecl *GetPayloadParameterForTraceCall(const CallExpr *Trace) {
-  const Decl *callee = Trace->getCalleeDecl();
-  if (!callee)
+static int GetPayloadParamIdxForIntrinsic(const FunctionDecl *FD) {
+  HLSLIntrinsicAttr *IntrinAttr = FD->getAttr<HLSLIntrinsicAttr>();
+  if (!IntrinAttr)
+    return -1;
+  switch ((IntrinsicOp)IntrinAttr->getOpcode()) {
+  default:
+    return -1;
+  case IntrinsicOp::IOP_TraceRay:
+  case IntrinsicOp::MOP_DxHitObject_TraceRay:
+  case IntrinsicOp::MOP_DxHitObject_Invoke:
+    return FD->getNumParams() - 1;
+  }
+}
+
+static bool IsBuiltinWithPayload(const FunctionDecl *FD) {
+  return GetPayloadParamIdxForIntrinsic(FD) >= 0;
+}
+
+// Returns the declaration of the payload used in a call to TraceRay,
+// HitObject::TraceRay or HitObject::Invoke.
+const VarDecl *GetPayloadParameterForBuiltinCall(const CallExpr *Call) {
+  const Decl *Callee = Call->getCalleeDecl();
+  if (!Callee)
     return nullptr;
 
-  if (!isa<FunctionDecl>(callee))
+  if (!isa<FunctionDecl>(Callee))
     return nullptr;
 
-  const FunctionDecl *FD = cast<FunctionDecl>(callee);
+  int PldParamIdx = GetPayloadParamIdxForIntrinsic(cast<FunctionDecl>(Callee));
+  if (PldParamIdx < 0)
+    return nullptr;
 
-  if (FD->isImplicit() && FD->getName() == "TraceRay") {
-    const Stmt *Param = IgnoreParensAndDecay(Trace->getArg(7));
-    if (const DeclRefExpr *ParamRef = dyn_cast<DeclRefExpr>(Param)) {
-      if (const VarDecl *Decl = dyn_cast<VarDecl>(ParamRef->getDecl()))
-        return Decl;
-    }
-  }
+  const Stmt *Param = IgnoreParensAndDecay(Call->getArg(PldParamIdx));
+  if (const DeclRefExpr *ParamRef = dyn_cast<DeclRefExpr>(Param))
+    if (const VarDecl *Decl = dyn_cast<VarDecl>(ParamRef->getDecl()))
+      return Decl;
   return nullptr;
 }
 
@@ -190,12 +209,9 @@ void CollectReadsWritesAndCallsForPayload(const Stmt *S,
   }
 }
 
-// Collects all TraceRay calls.
-void CollectTraceRayCalls(const Stmt *S, DxrShaderDiagnoseInfo &Info,
-                          const CFGBlock *Block) {
-  // TraceRay has void as return type so it should never be something else
-  // than a plain CallExpr.
-
+// Collects all calls to TraceRay, HitObject::TraceRay and HitObject::Invoke.
+void CollectBuiltinCallsWithPayload(const Stmt *S, DxrShaderDiagnoseInfo &Info,
+                                    const CFGBlock *Block) {
   if (const CallExpr *Call = dyn_cast<CallExpr>(S)) {
 
     const Decl *Callee = Call->getCalleeDecl();
@@ -204,11 +220,8 @@ void CollectTraceRayCalls(const Stmt *S, DxrShaderDiagnoseInfo &Info,
 
     const FunctionDecl *CalledFunction = cast<FunctionDecl>(Callee);
 
-    // Ignore trace calls here.
-    if (CalledFunction->isImplicit() &&
-        CalledFunction->getName() == "TraceRay") {
-      Info.TraceCalls.push_back({Call, Block});
-    }
+    if (IsBuiltinWithPayload(CalledFunction))
+      Info.PayloadBuiltinCalls.push_back({Call, Block});
   }
 }
 
@@ -528,13 +541,14 @@ void TraverseCFG(const CFGBlock &Block, Action PerElementAction,
   }
 }
 
-// Forward traverse the CFG and collect calls to TraceRay.
-void ForwardTraverseCFGAndCollectTraceCalls(
+// Forward traverse the CFG and collect calls to TraceRay, HitObject::TraceRay
+// and HitObject::Invoke.
+void ForwardTraverseCFGAndCollectBuiltinCallsWithPayload(
     const CFGBlock &Block, DxrShaderDiagnoseInfo &Info,
     std::set<const CFGBlock *> &Visited) {
   auto Action = [&Info](const CFGBlock &Block, const CFGElement &Element) {
     if (Optional<CFGStmt> S = Element.getAs<CFGStmt>()) {
-      CollectTraceRayCalls(S->getStmt(), Info, &Block);
+      CollectBuiltinCallsWithPayload(S->getStmt(), Info, &Block);
     }
   };
 
@@ -664,9 +678,9 @@ DiagnosePayloadAsFunctionArg(
       const FunctionDecl *CalledFunction = cast<FunctionDecl>(Callee);
 
       // Ignore trace calls here.
-      if (CalledFunction->isImplicit() &&
-          CalledFunction->getName() == "TraceRay") {
-        Info.TraceCalls.push_back(TraceRayCall{Call, Use.Parent});
+      if (IsBuiltinWithPayload(CalledFunction)) {
+        Info.PayloadBuiltinCalls.push_back(
+            PayloadBuiltinCall{Call, Use.Parent});
         continue;
       }
 
@@ -789,10 +803,12 @@ void HandlePayloadInitializer(DxrShaderDiagnoseInfo &Info) {
   }
 }
 
-// Emit diagnostics for a TraceRay call.
-void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
-                       const TraceRayCall &Trace, DominatorTree &DT) {
-  // For each TraceRay call check if write(caller) fields are written.
+// Emit diagnostics for this call to either TraceRay, HitObject::TraceRay or
+// HitObject::Invoke.
+void DiagnoseBuiltinCallWithPayload(Sema &S, const VarDecl *Payload,
+                                    const PayloadBuiltinCall &PldCall,
+                                    DominatorTree &DT) {
+  // For each call check if write(caller) fields are written.
   const DXIL::PayloadAccessShaderStage CallerStage =
       DXIL::PayloadAccessShaderStage::Caller;
 
@@ -810,6 +826,13 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
     return;
   }
 
+  // Verify that the payload type is legal
+  if (!hlsl::IsHLSLCopyableAnnotatableRecord(Payload->getType())) {
+    S.Diag(Payload->getLocation(), diag::err_payload_attrs_must_be_udt)
+        << /*payload|attributes|callable*/ 0 << Payload;
+    return;
+  }
+
   if (ContainsLongVector(Payload->getType())) {
     const unsigned PayloadParametersIdx = 10;
     S.Diag(Payload->getLocation(), diag::err_hlsl_unsupported_long_vector)
@@ -832,12 +855,12 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
 
   std::set<const CFGBlock *> Visited;
 
-  const CFGBlock *Parent = Trace.Parent;
+  const CFGBlock *Parent = PldCall.Parent;
   Visited.insert(Parent);
-  // Collect payload accesses in the same block until we reach the TraceRay call
+  // Collect payload accesses in the same block until we reach the call
   for (auto Element : *Parent) {
     if (Optional<CFGStmt> S = Element.getAs<CFGStmt>()) {
-      if (S->getStmt() == Trace.Call)
+      if (S->getStmt() == PldCall.Call)
         break;
       CollectReadsWritesAndCallsForPayload(S->getStmt(), TraceInfo, Parent);
     }
@@ -850,10 +873,12 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
     BackwardTraverseCFGAndCollectReadsWrites(*Pred, TraceInfo, Visited);
   }
 
+  int PldArgIdx = PldCall.Call->getNumArgs() - 1;
+
   // Warn if a writeable field has not been written.
   for (const FieldDecl *Field : WriteableFields) {
     if (!TraceInfo.WritesPerField.count(Field)) {
-      S.Diag(Trace.Call->getArg(7)->getExprLoc(),
+      S.Diag(PldCall.Call->getArg(PldArgIdx)->getExprLoc(),
              diag::warn_hlsl_payload_access_no_write_for_trace_payload)
           << Field->getName();
     }
@@ -862,7 +887,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
   for (const FieldDecl *Field : NonWriteableFields) {
     if (TraceInfo.WritesPerField.count(Field)) {
       S.Diag(
-          Trace.Call->getArg(7)->getExprLoc(),
+          PldCall.Call->getArg(PldArgIdx)->getExprLoc(),
           diag::warn_hlsl_payload_access_write_but_no_write_for_trace_payload)
           << Field->getName();
     }
@@ -878,7 +903,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
   bool CallFound = false;
   for (auto Element : *Parent) { // TODO: reverse iterate?
     if (Optional<CFGStmt> S = Element.getAs<CFGStmt>()) {
-      if (S->getStmt() == Trace.Call) {
+      if (S->getStmt() == PldCall.Call) {
         CallFound = true;
         continue;
       }
@@ -895,7 +920,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
 
   for (const FieldDecl *Field : ReadableFields) {
     if (!TraceInfo.ReadsPerField.count(Field)) {
-      S.Diag(Trace.Call->getArg(7)->getExprLoc(),
+      S.Diag(PldCall.Call->getArg(PldArgIdx)->getExprLoc(),
              diag::warn_hlsl_payload_access_read_but_no_read_after_trace)
           << Field->getName();
     }
@@ -928,27 +953,29 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload,
   }
 }
 
-// Emit diagnostics for all TraceRay calls.
-void DiagnoseTraceCalls(Sema &S, CFG &ShaderCFG, DominatorTree &DT,
-                        DxrShaderDiagnoseInfo &Info) {
-  // Collect TraceRay calls in the shader.
+// Emit diagnostics for all calls to TraceRay, HitObject::TraceRay or
+// HitObject::Invoke.
+void DiagnoseBuiltinCallsWithPayload(Sema &S, CFG &ShaderCFG, DominatorTree &DT,
+                                     DxrShaderDiagnoseInfo &Info) {
+  // Collect calls with payload in the shader.
   std::set<const CFGBlock *> Visited;
-  ForwardTraverseCFGAndCollectTraceCalls(ShaderCFG.getEntry(), Info, Visited);
+  ForwardTraverseCFGAndCollectBuiltinCallsWithPayload(ShaderCFG.getEntry(),
+                                                      Info, Visited);
 
   std::set<const CallExpr *> Diagnosed;
 
-  for (const TraceRayCall &TraceCall : Info.TraceCalls) {
-    if (Diagnosed.count(TraceCall.Call))
+  for (const PayloadBuiltinCall &PldCall : Info.PayloadBuiltinCalls) {
+    if (Diagnosed.count(PldCall.Call))
       continue;
-    Diagnosed.insert(TraceCall.Call);
+    Diagnosed.insert(PldCall.Call);
 
-    const VarDecl *Payload = GetPayloadParameterForTraceCall(TraceCall.Call);
-    DiagnoseTraceCall(S, Payload, TraceCall, DT);
+    const VarDecl *Payload = GetPayloadParameterForBuiltinCall(PldCall.Call);
+    DiagnoseBuiltinCallWithPayload(S, Payload, PldCall, DT);
   }
 }
 
 // Emit diagnostics for all access to the payload of a shader,
-// and the input to TraceRay calls.
+// and the input to TraceRay, HitObject::TraceRay or HitObject::Invoke calls.
 std::vector<const FieldDecl *>
 DiagnosePayloadAccess(Sema &S, DxrShaderDiagnoseInfo &Info,
                       const std::set<const FieldDecl *> &FieldsToIgnoreRead,
@@ -1012,7 +1039,7 @@ DiagnosePayloadAccess(Sema &S, DxrShaderDiagnoseInfo &Info,
       DiagnosePayloadReads(S, TheCFG, DT, Info, NonReadableFields);
   }
 
-  DiagnoseTraceCalls(S, TheCFG, DT, Info);
+  DiagnoseBuiltinCallsWithPayload(S, TheCFG, DT, Info);
 
   return WrittenFields;
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl
new file mode 100644
index 0000000000..f4781bc796
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -T lib_6_9 %s -D TEST_NUM=0 %s -verify
+// RUN: %dxc -T lib_6_9 %s -D TEST_NUM=1 %s -verify
+
+RaytracingAccelerationStructure scene : register(t0);
+
+struct Payload
+{
+    int a : read (caller, closesthit, miss) : write(caller, closesthit, miss);
+};
+
+struct Attribs
+{
+    float2 barys;
+};
+
+[shader("raygeneration")]
+void RayGen()
+{
+// expected-error@+1{{type 'Payload' used as payload requires that it is annotated with the [raypayload] attribute}}
+    Payload payload_in_rg;
+    RayDesc ray;
+#if TEST_NUM == 0
+    dx::HitObject::TraceRay( scene, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload_in_rg );
+#else
+    dx::HitObject::Invoke( dx::HitObject(), payload_in_rg );
+#endif
+}
\ No newline at end of file
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
new file mode 100644
index 0000000000..e89e33a78f
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
@@ -0,0 +1,22 @@
+// RUN: %dxc -T lib_6_9 %s -verify
+
+struct
+[raypayload]
+Payload
+{
+    int a : read(caller, closesthit, miss) : write(caller, closesthit, miss);
+    dx::HitObject hit;
+};
+
+struct Attribs
+{
+    float2 barys;
+};
+
+[shader("raygeneration")]
+void RayGen()
+{
+  // expected-error@+1{{payload parameter 'payload_in_rg' must be a user-defined type composed of only numeric types}}
+  Payload payload_in_rg;
+  dx::HitObject::Invoke( dx::HitObject(), payload_in_rg );
+}
\ No newline at end of file

From f19b5da54170210e3cbc7f080be3f080abc52505 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Tue, 22 Apr 2025 18:32:11 +0200
Subject: [PATCH 04/93] [SER] TraceRay|Invoke HLSL -> DXIL lowering (#7355)

Lowering for
- HitObject::TraceRay
- HitObject::Invoke

Specification:
https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md
DXC SER implementation tracker: #7214
---
 include/dxc/DXIL/DxilConstants.h              |   5 +
 include/dxc/HLSL/HLOperations.h               |   4 +
 lib/HLSL/HLOperationLower.cpp                 | 126 ++++++++-----
 .../Scalar/ScalarReplAggregatesHLSL.cpp       |   2 +
 tools/clang/lib/Sema/SemaHLSL.cpp             |   2 +
 .../HitObject/hitobject_traceinvoke.hlsl      | 102 +++++++++++
 .../DxilGen/hitobject_traceinvoke_dxilgen.ll  | 167 ++++++++++++++++++
 7 files changed, 367 insertions(+), 41 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll

diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index 8c73328fbd..3752274f18 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -1556,6 +1556,11 @@ const unsigned kMSStoreOutputColOpIdx = 3;
 const unsigned kMSStoreOutputVIdxOpIdx = 4;
 const unsigned kMSStoreOutputValOpIdx = 5;
 
+// HitObject::TraceRay
+const unsigned kHitObjectTraceRay_RayDescOpIdx = 7;
+const unsigned kHitObjectTraceRay_PayloadOpIdx = 15;
+const unsigned kHitObjectTraceRay_NumOp = 16;
+
 // TODO: add operand index for all the OpCodeClass.
 } // namespace OperandIndex
 
diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h
index a7db8612a6..970ddd3e85 100644
--- a/include/dxc/HLSL/HLOperations.h
+++ b/include/dxc/HLSL/HLOperations.h
@@ -437,6 +437,10 @@ const unsigned kAnnotateNodeRecordHandleNodeRecordPropIdx = 2;
 const unsigned kHitObjectMakeMiss_NumOp = 8;
 const unsigned kHitObjectMakeMissRayDescOpIdx = 4;
 
+// HitObject::TraceRay
+const unsigned kHitObjectTraceRay_RayDescOpIdx = 8;
+const unsigned kHitObjectTraceRay_NumOp = 10;
+
 } // namespace HLOperandIndex
 
 llvm::Function *GetOrCreateHLFunction(llvm::Module &M,
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index be45021e41..69dd803f7b 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -19,6 +19,8 @@
 #include <functional>
 #include <unordered_set>
 
+#include "dxc/DXIL/DxilConstants.h"
+#include "dxc/DXIL/DxilInstructions.h"
 #include "dxc/DXIL/DxilModule.h"
 #include "dxc/DXIL/DxilOperations.h"
 #include "dxc/DXIL/DxilResourceProperties.h"
@@ -5718,23 +5720,9 @@ Value *TranslateCallShader(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return Builder.CreateCall(F, {opArg, ShaderIndex, Parameter});
 }
 
-Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
-                         HLOperationLowerHelper &helper,
-                         HLObjectOperationLowerHelper *pObjHelper,
-                         bool &Translated) {
-  hlsl::OP *hlslOP = &helper.hlslOP;
-
-  Value *rayDesc = CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx);
-  Value *payLoad = CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx);
-
-  Value *opArg = hlslOP->GetU32Const(static_cast<unsigned>(opcode));
-
-  Value *Args[DXIL::OperandIndex::kTraceRayNumOp];
-  Args[0] = opArg;
-  for (unsigned i = 1; i < HLOperandIndex::kTraceRayRayDescOpIdx; i++) {
-    Args[i] = CI->getArgOperand(i);
-  }
-  IRBuilder<> Builder(CI);
+static unsigned LoadRayDescElementsIntoArgs(Value **Args, hlsl::OP *OP,
+                                            IRBuilder<> &Builder,
+                                            Value *RayDescPtr, unsigned Index) {
   // struct RayDesc
   //{
   //    float3 Origin;
@@ -5742,34 +5730,51 @@ Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   //    float3 Direction;
   //    float  TMax;
   //};
-  Value *zeroIdx = hlslOP->GetU32Const(0);
-  Value *origin = Builder.CreateGEP(rayDesc, {zeroIdx, zeroIdx});
-  origin = Builder.CreateLoad(origin);
-  unsigned index = DXIL::OperandIndex::kTraceRayRayDescOpIdx;
-  Args[index++] = Builder.CreateExtractElement(origin, (uint64_t)0);
-  Args[index++] = Builder.CreateExtractElement(origin, 1);
-  Args[index++] = Builder.CreateExtractElement(origin, 2);
+  Value *ZeroIdx = OP->GetU32Const(0);
+  Value *Origin = Builder.CreateGEP(RayDescPtr, {ZeroIdx, ZeroIdx});
+  Origin = Builder.CreateLoad(Origin);
+  Args[Index++] = Builder.CreateExtractElement(Origin, (uint64_t)0);
+  Args[Index++] = Builder.CreateExtractElement(Origin, 1);
+  Args[Index++] = Builder.CreateExtractElement(Origin, 2);
 
-  Value *tmin = Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(1)});
-  tmin = Builder.CreateLoad(tmin);
-  Args[index++] = tmin;
+  Value *TMinPtr = Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(1)});
+  Args[Index++] = Builder.CreateLoad(TMinPtr);
 
-  Value *direction =
-      Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(2)});
-  direction = Builder.CreateLoad(direction);
+  Value *DirectionPtr =
+      Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(2)});
+  Value *Direction = Builder.CreateLoad(DirectionPtr);
 
-  Args[index++] = Builder.CreateExtractElement(direction, (uint64_t)0);
-  Args[index++] = Builder.CreateExtractElement(direction, 1);
-  Args[index++] = Builder.CreateExtractElement(direction, 2);
+  Args[Index++] = Builder.CreateExtractElement(Direction, (uint64_t)0);
+  Args[Index++] = Builder.CreateExtractElement(Direction, 1);
+  Args[Index++] = Builder.CreateExtractElement(Direction, 2);
 
-  Value *tmax = Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(3)});
-  tmax = Builder.CreateLoad(tmax);
-  Args[index++] = tmax;
+  Value *TMaxPtr = Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(3)});
+  Args[Index++] = Builder.CreateLoad(TMaxPtr);
+  return Index;
+}
+
+Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
+                         HLOperationLowerHelper &Helper,
+                         HLObjectOperationLowerHelper *pObjHelper,
+                         bool &Translated) {
+  hlsl::OP *OP = &Helper.hlslOP;
 
-  Args[DXIL::OperandIndex::kTraceRayPayloadOpIdx] = payLoad;
+  Value *RayDesc = CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx);
+  Value *PayLoad = CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx);
 
-  Type *Ty = payLoad->getType();
-  Function *F = hlslOP->GetOpFunc(opcode, Ty);
+  Value *Args[DXIL::OperandIndex::kTraceRayNumOp];
+  Args[0] = OP->GetU32Const(static_cast<unsigned>(OpCode));
+  for (unsigned i = 1; i < HLOperandIndex::kTraceRayRayDescOpIdx; i++)
+    Args[i] = CI->getArgOperand(i);
+
+  IRBuilder<> Builder(CI);
+  LoadRayDescElementsIntoArgs(Args, OP, Builder, RayDesc,
+                              DXIL::OperandIndex::kTraceRayRayDescOpIdx);
+
+  Args[DXIL::OperandIndex::kTraceRayPayloadOpIdx] = PayLoad;
+
+  Type *Ty = PayLoad->getType();
+  Function *F = OP->GetOpFunc(OpCode, Ty);
 
   return Builder.CreateCall(F, Args);
 }
@@ -6307,7 +6312,37 @@ Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP,
                                   HLOperationLowerHelper &Helper,
                                   HLObjectOperationLowerHelper *pObjHelper,
                                   bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  const unsigned DxilNumArgs = DxilInst_HitObject_TraceRay::arg_payload + 1;
+  DXASSERT_NOMSG(CI->getNumArgOperands() ==
+                 HLOperandIndex::kHitObjectTraceRay_NumOp);
+  Value *Args[DxilNumArgs];
+  Value *OpArg = OP->GetU32Const(static_cast<unsigned>(OpCode));
+  Args[0] = OpArg;
+
+  unsigned DestIdx = 1, SrcIdx = 1;
+  Value *HitObjectPtr = CI->getArgOperand(SrcIdx++);
+  Args[DestIdx++] = CI->getArgOperand(SrcIdx++);
+  for (; SrcIdx < HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx;
+       ++SrcIdx, ++DestIdx) {
+    Args[DestIdx] = CI->getArgOperand(SrcIdx);
+  }
+
+  Value *RayDescPtr = CI->getArgOperand(SrcIdx++);
+  DestIdx = LoadRayDescElementsIntoArgs(Args, OP, Builder, RayDescPtr, DestIdx);
+  Value *Payload = CI->getArgOperand(SrcIdx++);
+  Args[DestIdx++] = Payload;
+
+  DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
+  DXASSERT_NOMSG(DestIdx == DxilNumArgs);
+
+  Function *F = OP->GetOpFunc(OpCode, Payload->getType());
+
+  Value *OutHitObject = Builder.CreateCall(F, Args);
+  Builder.CreateStore(OutHitObject, HitObjectPtr);
+  return nullptr;
 }
 
 Value *TranslateHitObjectInvoke(CallInst *CI, IntrinsicOp IOP,
@@ -6315,7 +6350,16 @@ Value *TranslateHitObjectInvoke(CallInst *CI, IntrinsicOp IOP,
                                 HLOperationLowerHelper &Helper,
                                 HLObjectOperationLowerHelper *pObjHelper,
                                 bool &Translated) {
-  return nullptr; // TODO: Merge SER DXIL patches
+  unsigned SrcIdx = 1;
+  Value *HitObjectPtr = CI->getArgOperand(SrcIdx++);
+  Value *Payload = CI->getArgOperand(SrcIdx++);
+  DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
+
+  IRBuilder<> Builder(CI);
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+  TrivialDxilOperation(OpCode, {nullptr, HitObject, Payload},
+                       Payload->getType(), CI, &Helper.hlslOP);
+  return nullptr;
 }
 
 Value *TranslateHitObjectGetAttributes(CallInst *CI, IntrinsicOp IOP,
diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index e487079b94..b13e9a0f5d 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -1540,6 +1540,8 @@ void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) {
         // basing on IOP?
         IntrinsicOp opcode = static_cast<IntrinsicOp>(GetHLOpcode(CI));
         if (IntrinsicOp::IOP_TraceRay == opcode ||
+            IntrinsicOp::MOP_DxHitObject_TraceRay == opcode ||
+            IntrinsicOp::MOP_DxHitObject_Invoke == opcode ||
             IntrinsicOp::IOP_ReportHit == opcode ||
             IntrinsicOp::IOP_CallShader == opcode) {
           return MarkUnsafe(Info, User);
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 418425a468..bddf834509 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -12066,8 +12066,10 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
   case hlsl::IntrinsicOp::MOP_TraceRayInline:
     DiagnoseTraceRayInline(*this, CE);
     break;
+  case hlsl::IntrinsicOp::MOP_DxHitObject_Invoke:
   case hlsl::IntrinsicOp::MOP_DxHitObject_MakeMiss:
   case hlsl::IntrinsicOp::MOP_DxHitObject_MakeNop:
+  case hlsl::IntrinsicOp::MOP_DxHitObject_TraceRay:
     DiagnoseReachableSERCall(*this, CE, EntrySK, EntryDecl, false);
     break;
   case hlsl::IntrinsicOp::IOP_DxMaybeReorderThread:
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
new file mode 100644
index 0000000000..13bff4a3f4
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
@@ -0,0 +1,102 @@
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Invoke
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class Tho
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TPayload
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit Invoke 'TResult (Tho, TPayload &) const' static
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> ho 'Tho'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Payload 'TPayload &'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used Invoke 'void (dx::HitObject, Payload &)' static
+// AST-NEXT: | | |   |-TemplateArgument type 'void'
+// AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject':'dx::HitObject'
+// AST-NEXT: | | |   |-TemplateArgument type 'Payload'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Invoke 'dx::HitObject':'dx::HitObject'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> ho 'Payload &&__restrict'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 382
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> TraceRay
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TAccelerationStructure
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TRayFlags
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TInstanceInclusionMask
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TRayContributionToHitGroupIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TMultiplierForGeometryContributionToHitGroupIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TMissShaderIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TRay
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TPayload
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit TraceRay 'TResult (TAccelerationStructure, TRayFlags, TInstanceInclusionMask, TRayContributionToHitGroupIndex, TMultiplierForGeometryContributionToHitGroupIndex, TMissShaderIndex, TRay, TPayload &) const' static
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> AccelerationStructure 'TAccelerationStructure'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayFlags 'TRayFlags'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> InstanceInclusionMask 'TInstanceInclusionMask'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayContributionToHitGroupIndex 'TRayContributionToHitGroupIndex'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MultiplierForGeometryContributionToHitGroupIndex 'TMultiplierForGeometryContributionToHitGroupIndex'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'TMissShaderIndex'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Ray 'TRay'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Payload 'TPayload &'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used TraceRay 'dx::HitObject (RaytracingAccelerationStructure, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, RayDesc, Payload &)' static
+// AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject'
+// AST-NEXT: | | |   |-TemplateArgument type 'RaytracingAccelerationStructure'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'RayDesc'
+// AST-NEXT: | | |   |-TemplateArgument type 'Payload'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> TraceRay 'RaytracingAccelerationStructure'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> AccelerationStructure 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayFlags 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> InstanceInclusionMask 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayContributionToHitGroupIndex 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MultiplierForGeometryContributionToHitGroupIndex 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'RayDesc'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Ray 'Payload &&__restrict'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 389
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// FCGL:  %[[HANDLE:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef)
+// FCGL-NEXT:  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %{{[^ ]+}}, %dx.types.Handle %[[HANDLE]], i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %{{[^ ]+}}, %struct.Payload* %{{[^ ]+}})
+// FCGL-NEXT:  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %{{[^ ]+}}, %struct.Payload* %{{[^ ]+}})
+
+// DXIL:  %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 })  ; AnnotateHandle(res,props)  resource: RTAccelerationStructure
+// DXIL:  %[[HIT:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %{{[^ ]+}})  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+// DXIL:  call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[HIT]], %struct.Payload* nonnull %{{[^ ]+}})  ; HitObject_Invoke(hitObject,payload)
+
+// DXIL: !dx.dxrPayloadAnnotations = !{![[MDPLD:[^ ]+]]}
+// DXIL: ![[MDPLD]] = !{i32 0, %struct.Payload undef, !{{[^ ]+}}}
+
+RaytracingAccelerationStructure RTAS;
+RWStructuredBuffer<float> UAV : register(u0);
+
+struct [raypayload]
+Payload {
+  float3 dummy : read(closesthit) : write(caller, anyhit);
+};
+
+[shader("raygeneration")]
+void main() {
+  RayDesc rayDesc;
+  rayDesc.Origin = float3(0.0, 1.0, 2.0);
+  rayDesc.TMin = 3.0f;
+  rayDesc.Direction = float3(4.0, 5.0, 6.0);
+  rayDesc.TMax = 7.0f;
+
+  Payload pld;
+  pld.dummy = float3(7.0, 8.0, 9.0);
+
+  dx::HitObject hit = dx::HitObject::TraceRay(
+      RTAS,
+      RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES,
+      1,
+      2,
+      4,
+      0,
+      rayDesc,
+      pld);
+
+  dx::HitObject::Invoke(hit, pld);
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll
new file mode 100644
index 0000000000..6f364a0161
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll
@@ -0,0 +1,167 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+; REQUIRES: dxil-1-9
+
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; RTAS                              texture     i32         ras      T0t4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%struct.Payload = type { <3 x float> }
+%dx.types.HitObject = type { i8* }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.RWStructuredBuffer<float>" = type { float }
+%"class.dx::HitObject" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+entry:
+  %rayDesc = alloca %struct.RayDesc, align 4
+  %pld = alloca %struct.Payload, align 4
+  %hit = alloca %dx.types.HitObject, align 4
+  %0 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !31 ; line:80 col:3
+  call void @llvm.lifetime.start(i64 32, i8* %0) #0, !dbg !31 ; line:80 col:3
+  %Origin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 0, !dbg !35 ; line:81 col:11
+  store <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, <3 x float>* %Origin, align 4, !dbg !36, !tbaa !37 ; line:81 col:18
+  %TMin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 1, !dbg !40 ; line:82 col:11
+  store float 3.000000e+00, float* %TMin, align 4, !dbg !41, !tbaa !42 ; line:82 col:16
+  %Direction = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 2, !dbg !44 ; line:83 col:11
+  store <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, <3 x float>* %Direction, align 4, !dbg !45, !tbaa !37 ; line:83 col:21
+  %TMax = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 3, !dbg !46 ; line:84 col:11
+  store float 7.000000e+00, float* %TMax, align 4, !dbg !47, !tbaa !42 ; line:84 col:16
+  %1 = bitcast %struct.Payload* %pld to i8*, !dbg !48 ; line:86 col:3
+  call void @llvm.lifetime.start(i64 12, i8* %1) #0, !dbg !48 ; line:86 col:3
+  %dummy = getelementptr inbounds %struct.Payload, %struct.Payload* %pld, i32 0, i32 0, !dbg !49 ; line:87 col:7
+  store <3 x float> <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>, <3 x float>* %dummy, align 4, !dbg !50, !tbaa !37 ; line:87 col:13
+  %2 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !51 ; line:89 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %2) #0, !dbg !51 ; line:89 col:3
+  %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !52 ; line:89 col:23
+  %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !52 ; line:89 col:23
+  %5 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !52 ; line:89 col:23
+  ; CHECK: %[[ORIGINPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR:[^ ]+]], i32 0, i32 0
+  ; CHECK: %[[ORIGIN:[^ ]+]] = load <3 x float>, <3 x float>* %[[ORIGINPTR]]
+  ; CHECK: %[[O0:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 0
+  ; CHECK: %[[O1:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 1
+  ; CHECK: %[[O2:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 2
+  ; CHECK: %[[TMINPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 1
+  ; CHECK: %[[TMIN:[^ ]+]] = load float, float* %[[TMINPTR]]
+  ; CHECK: %[[DIRPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 2
+  ; CHECK: %[[DIR:[^ ]+]] = load <3 x float>, <3 x float>* %[[DIRPTR]]
+  ; CHECK: %[[D0:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 0
+  ; CHECK: %[[D1:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 1
+  ; CHECK: %[[D2:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 2
+  ; CHECK: %[[TMAXPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 3
+  ; CHECK: %[[TMAX:[^ ]+]] = load float, float* %[[TMAXPTR]]
+  ; CHECK: %[[TRACEHO:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, float %[[O0]], float %[[O1]], float %[[O2]], float %[[TMIN]], float %[[D0]], float %[[D1]], float %[[D2]], float %[[TMAX]], %struct.Payload* %pld)
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %rayDesc, %struct.Payload* %pld), !dbg !52 ; line:89 col:23
+  ; CHECK: store %dx.types.HitObject %[[TRACEHO]], %dx.types.HitObject* %[[HOPTR:[^ ]+]]
+  ; CHECK: %[[INVOKEHO:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOPTR]]
+  ; CHECK: call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[INVOKEHO]], %struct.Payload* %pld)
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld), !dbg !53 ; line:99 col:3
+  %6 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !54 ; line:100 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %6) #0, !dbg !54 ; line:100 col:1
+  %7 = bitcast %struct.Payload* %pld to i8*, !dbg !54 ; line:100 col:1
+  call void @llvm.lifetime.end(i64 12, i8* %7) #0, !dbg !54 ; line:100 col:1
+  %8 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !54 ; line:100 col:1
+  call void @llvm.lifetime.end(i64 32, i8* %8) #0, !dbg !54 ; line:100 col:1
+  ret void, !dbg !54 ; line:100 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32, %dx.types.HitObject*, %struct.Payload*) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!dx.version = !{!2}
+!dx.valver = !{!2}
+!dx.shaderModel = !{!3}
+!dx.typeAnnotations = !{!4, !19}
+!dx.entryPoints = !{!23}
+!dx.fnprops = !{!28}
+!dx.options = !{!29, !30}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{i32 1, i32 9}
+!3 = !{!"lib", i32 6, i32 9}
+!4 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !5, %struct.RayDesc undef, !10, %struct.Payload undef, !15, %"class.dx::HitObject" undef, !17}
+!5 = !{i32 4, !6, !7}
+!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
+!7 = !{i32 0, !8}
+!8 = !{!9}
+!9 = !{i32 0, float undef}
+!10 = !{i32 32, !11, !12, !13, !14}
+!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
+!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!15 = !{i32 12, !16}
+!16 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!17 = !{i32 4, !18}
+!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!19 = !{i32 1, void ()* @"\01?main@@YAXXZ", !20}
+!20 = !{!21}
+!21 = !{i32 1, !22, !22}
+!22 = !{}
+!23 = !{null, !"", null, !24, null}
+!24 = !{!25, null, null, null}
+!25 = !{!26}
+!26 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !27}
+!27 = !{i32 0, i32 4}
+!28 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!29 = !{i32 -2147483584}
+!30 = !{i32 -1}
+!31 = !DILocation(line: 80, column: 3, scope: !32)
+!32 = !DISubprogram(name: "main", scope: !33, file: !33, line: 79, type: !34, isLocal: false, isDefinition: true, scopeLine: 79, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!33 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl", directory: "")
+!34 = !DISubroutineType(types: !22)
+!35 = !DILocation(line: 81, column: 11, scope: !32)
+!36 = !DILocation(line: 81, column: 18, scope: !32)
+!37 = !{!38, !38, i64 0}
+!38 = !{!"omnipotent char", !39, i64 0}
+!39 = !{!"Simple C/C++ TBAA"}
+!40 = !DILocation(line: 82, column: 11, scope: !32)
+!41 = !DILocation(line: 82, column: 16, scope: !32)
+!42 = !{!43, !43, i64 0}
+!43 = !{!"float", !38, i64 0}
+!44 = !DILocation(line: 83, column: 11, scope: !32)
+!45 = !DILocation(line: 83, column: 21, scope: !32)
+!46 = !DILocation(line: 84, column: 11, scope: !32)
+!47 = !DILocation(line: 84, column: 16, scope: !32)
+!48 = !DILocation(line: 86, column: 3, scope: !32)
+!49 = !DILocation(line: 87, column: 7, scope: !32)
+!50 = !DILocation(line: 87, column: 13, scope: !32)
+!51 = !DILocation(line: 89, column: 3, scope: !32)
+!52 = !DILocation(line: 89, column: 23, scope: !32)
+!53 = !DILocation(line: 99, column: 3, scope: !32)
+!54 = !DILocation(line: 100, column: 1, scope: !32)

From ea95489309139c47c87fe2b2a54fc426910e8ccd Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 23 Apr 2025 07:32:35 -0400
Subject: [PATCH 05/93] [SPIRV] Update submodules (#7369)

---
 external/SPIRV-Headers                                       | 2 +-
 external/SPIRV-Tools                                         | 2 +-
 tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 0e71067798..aa6cef192b 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 0e710677989b4326ac974fd80c5308191ed80965
+Subproject commit aa6cef192b8e693916eb713e7a9ccadf06062ceb
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 4bd1536ed7..898ed77be1 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 4bd1536ed79003a5194a4bd8c9aa2fa17a84c15b
+Subproject commit 898ed77be18c99418f983ea220be91a926e6e26e
diff --git a/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl b/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl
index da25ead9c1..312476b260 100644
--- a/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl
@@ -1,4 +1,4 @@
-// RUN: %dxc -T as_6_6 -E main -fspv-target-env=vulkan1.3 -fcgl  %s -spirv | FileCheck %s
+// RUN: %dxc -T as_6_6 -E main -fspv-target-env=universal1.5 -fcgl  %s -spirv | FileCheck %s
 
 // CHECK: OpCapability Linkage
 // CHECK: OpDecorate %external_function LinkageAttributes "external_function" Export
@@ -10,4 +10,4 @@ export int external_function() {
 void main() {
   external_function();
 	return;
-}
\ No newline at end of file
+}

From 06381f2d7b2c8b32add7839dc068f0e761f4d4b4 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 23 Apr 2025 10:48:19 -0400
Subject: [PATCH 06/93] [SPIRV] Update submodules (#7373)

Update the submodules to the latest release candidate for the Vulkan
SDK.
---
 external/SPIRV-Tools | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 898ed77be1..a62abcb402 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 898ed77be18c99418f983ea220be91a926e6e26e
+Subproject commit a62abcb402009b9ca5975e6167c09f237f630e0e

From bddee27c021ce88df9c90ca2d88cf6872f6c0963 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Wed, 23 Apr 2025 18:35:56 +0200
Subject: [PATCH 07/93] [SER] HitObject accessors HLSL -> DXIL lowering (#7360)

Lowering for all HitObject accessors (except GetAttributes)

Specification:
https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md
DXC SER implementation tracker:
https://github.com/microsoft/DirectXShaderCompiler/issues/7214
---
 lib/HLSL/HLOperationLower.cpp                 |  98 ++-
 .../HitObject/hitobject_accessors.hlsl        | 113 +++
 .../DxilGen/hitobject_accessors_dxilgen.ll    | 687 ++++++++++++++++++
 .../HitObject/hitobject_accessors.hlsl        | 263 +++++++
 4 files changed, 1142 insertions(+), 19 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 69dd803f7b..1e43cce07c 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -5958,19 +5958,31 @@ Value *TranslateNoArgVectorOperation(CallInst *CI, IntrinsicOp IOP,
   return retVal;
 }
 
+template <typename ColElemTy>
+static void GetMatrixIndices(Constant *&Rows, Constant *&Cols, bool Is3x4,
+                             LLVMContext &Ctx) {
+  if (Is3x4) {
+    uint32_t RVals[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2};
+    Rows = ConstantDataVector::get(Ctx, RVals);
+    ColElemTy CVals[] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+    Cols = ConstantDataVector::get(Ctx, CVals);
+    return;
+  }
+  uint32_t RVals[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2};
+  Rows = ConstantDataVector::get(Ctx, RVals);
+  ColElemTy CVals[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3};
+  Cols = ConstantDataVector::get(Ctx, CVals);
+}
+
 Value *TranslateNoArgMatrix3x4Operation(
     CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
     HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper,
     bool &Translated) {
   hlsl::OP *hlslOP = &helper.hlslOP;
   VectorType *Ty = cast<VectorType>(CI->getType());
-  uint32_t rVals[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2};
-  Constant *rows = ConstantDataVector::get(CI->getContext(), rVals);
-  uint8_t cVals[] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
-  Constant *cols = ConstantDataVector::get(CI->getContext(), cVals);
-  Value *retVal =
-      TrivialDxilOperation(opcode, {nullptr, rows, cols}, Ty, CI, hlslOP);
-  return retVal;
+  Constant *Rows, *Cols;
+  GetMatrixIndices<uint8_t>(Rows, Cols, true, CI->getContext());
+  return TrivialDxilOperation(opcode, {nullptr, Rows, Cols}, Ty, CI, hlslOP);
 }
 
 Value *TranslateNoArgTransposedMatrix3x4Operation(
@@ -5979,13 +5991,9 @@ Value *TranslateNoArgTransposedMatrix3x4Operation(
     bool &Translated) {
   hlsl::OP *hlslOP = &helper.hlslOP;
   VectorType *Ty = cast<VectorType>(CI->getType());
-  uint32_t rVals[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2};
-  Constant *rows = ConstantDataVector::get(CI->getContext(), rVals);
-  uint8_t cVals[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3};
-  Constant *cols = ConstantDataVector::get(CI->getContext(), cVals);
-  Value *retVal =
-      TrivialDxilOperation(opcode, {nullptr, rows, cols}, Ty, CI, hlslOP);
-  return retVal;
+  Constant *Rows, *Cols;
+  GetMatrixIndices<uint8_t>(Rows, Cols, false, CI->getContext());
+  return TrivialDxilOperation(opcode, {nullptr, Rows, Cols}, Ty, CI, hlslOP);
 }
 
 /*
@@ -6375,7 +6383,12 @@ Value *TranslateHitObjectScalarGetter(CallInst *CI, IntrinsicOp IOP,
                                       HLOperationLowerHelper &Helper,
                                       HLObjectOperationLowerHelper *pObjHelper,
                                       bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  IRBuilder<> Builder(CI);
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+  return TrivialDxilOperation(OpCode, {nullptr, HitObject}, CI->getType(), CI,
+                              OP);
 }
 
 Value *TranslateHitObjectVectorGetter(CallInst *CI, IntrinsicOp IOP,
@@ -6383,7 +6396,24 @@ Value *TranslateHitObjectVectorGetter(CallInst *CI, IntrinsicOp IOP,
                                       HLOperationLowerHelper &Helper,
                                       HLObjectOperationLowerHelper *pObjHelper,
                                       bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  IRBuilder<> Builder(CI);
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+  VectorType *Ty = cast<VectorType>(CI->getType());
+  uint32_t Vals[] = {0, 1, 2, 3};
+  Constant *Src = ConstantDataVector::get(CI->getContext(), Vals);
+  return TrivialDxilOperation(OpCode, {nullptr, HitObject, Src}, Ty, CI, OP);
+}
+
+static bool IsHitObject3x4Getter(IntrinsicOp IOP) {
+  switch (IOP) {
+  default:
+    return false;
+  case IntrinsicOp::MOP_DxHitObject_GetObjectToWorld3x4:
+  case IntrinsicOp::MOP_DxHitObject_GetWorldToObject3x4:
+    return true;
+  }
 }
 
 Value *TranslateHitObjectMatrixGetter(CallInst *CI, IntrinsicOp IOP,
@@ -6391,21 +6421,51 @@ Value *TranslateHitObjectMatrixGetter(CallInst *CI, IntrinsicOp IOP,
                                       HLOperationLowerHelper &Helper,
                                       HLObjectOperationLowerHelper *pObjHelper,
                                       bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  IRBuilder<> Builder(CI);
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+
+  // Create 3x4 matrix indices
+  bool Is3x4 = IsHitObject3x4Getter(IOP);
+  Constant *Rows, *Cols;
+  GetMatrixIndices<uint32_t>(Rows, Cols, Is3x4, CI->getContext());
+
+  VectorType *Ty = cast<VectorType>(CI->getType());
+  return TrivialDxilOperation(OpCode, {nullptr, HitObject, Rows, Cols}, Ty, CI,
+                              OP);
 }
 
 Value *TranslateHitObjectLoadLocalRootTableConstant(
     CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
     HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper,
     bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  Value *Offset = CI->getArgOperand(2);
+
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+  return TrivialDxilOperation(OpCode, {nullptr, HitObject, Offset},
+                              Helper.voidTy, CI, OP);
 }
 
 Value *TranslateHitObjectSetShaderTableIndex(
     CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
     HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper,
     bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  Value *ShaderTableIndex = CI->getArgOperand(2);
+
+  Value *InHitObject = Builder.CreateLoad(HitObjectPtr);
+  Value *OutHitObject = TrivialDxilOperation(
+      OpCode, {nullptr, InHitObject, ShaderTableIndex}, Helper.voidTy, CI, OP);
+  Builder.CreateStore(OutHitObject, HitObjectPtr);
+  return nullptr;
 }
 
 } // namespace
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
new file mode 100644
index 0000000000..bae2b0590c
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
@@ -0,0 +1,113 @@
+// REQUIRES: dxil-1-9
+// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL
+
+// DXIL: %dx.types.HitObject = type { i8* }
+
+// DXIL:   %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+// DXIL:   %[[HIT:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %[[NOP]], i32 1)  ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex)
+// DXIL-DAG:   %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject %[[HIT]])  ; HitObject_IsHit(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject %[[HIT]])  ; HitObject_IsMiss(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject %[[HIT]])  ; HitObject_IsNop(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject %[[HIT]])  ; HitObject_GeometryIndex(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject %[[HIT]])  ; HitObject_HitKind(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject %[[HIT]])  ; HitObject_InstanceIndex(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject %[[HIT]])  ; HitObject_InstanceID(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject %[[HIT]])  ; HitObject_PrimitiveIndex(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject %[[HIT]])  ; HitObject_ShaderTableIndex(hitObject)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %[[HIT]], i32 42)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 0)  ; HitObject_ObjectRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 1)  ; HitObject_ObjectRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 2)  ; HitObject_ObjectRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 0)  ; HitObject_ObjectRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 1)  ; HitObject_ObjectRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 2)  ; HitObject_ObjectRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 0)  ; HitObject_WorldRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 1)  ; HitObject_WorldRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 2)  ; HitObject_WorldRayOrigin(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 0)  ; HitObject_WorldRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 1)  ; HitObject_WorldRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 2)  ; HitObject_WorldRayDirection(hitObject,component)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 1)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 2)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 3)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 1)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 2)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 3)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 1)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 2)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 3)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 1)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 2)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 3)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 1)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 2)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 3)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 1)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 2)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 3)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+// DXIL:   ret void
+
+RWByteAddressBuffer outbuf; // Receives the two sums stored at the end of main().
+
+template <int M, int N>
+float hashM(in matrix<float, M, N> mat) { // Sums all M*N elements of mat.
+  float h = 0.f;
+  for (int i = 0; i < M; ++i)
+    for (int j = 0; j < N; ++j)
+      h += mat[i][j]; // presumably reads every element so each load stays live
+  return h;
+}
+
+[shader("raygeneration")]
+void main() { // Calls the HitObject setter and getters, then stores the sums.
+  dx::HitObject hit; // the checks expect a HitObject_MakeNop for this object
+  int isum = 0;
+  float fsum = 0.0f;
+  vector<float, 3> vsum = 0;
+
+  ///// Setters
+  hit.SetShaderTableIndex(1);
+
+  ///// Getters
+
+  // i1 accessors
+  isum += hit.IsHit();
+  isum += hit.IsMiss();
+  isum += hit.IsNop();
+
+  // i32 accessors
+  isum += hit.GetGeometryIndex();
+  isum += hit.GetHitKind();
+  isum += hit.GetInstanceIndex();
+  isum += hit.GetInstanceID();
+  isum += hit.GetPrimitiveIndex();
+  isum += hit.GetShaderTableIndex();
+  isum += hit.LoadLocalRootTableConstant(42);
+
+  // float3 accessors
+  vsum += hit.GetWorldRayOrigin();
+  vsum += hit.GetWorldRayDirection();
+  vsum += hit.GetObjectRayOrigin();
+  vsum += hit.GetObjectRayDirection();
+  fsum += vsum[0] + vsum[1] + vsum[2];
+
+  // matrix accessors
+  fsum += hashM<3, 4>(hit.GetObjectToWorld3x4());
+  fsum += hashM<4, 3>(hit.GetObjectToWorld4x3());
+  fsum += hashM<3, 4>(hit.GetWorldToObject3x4());
+  fsum += hashM<4, 3>(hit.GetWorldToObject4x3());
+
+  // ray flags (summed into isum) and f32 accessors
+  isum += hit.GetRayFlags();
+  fsum += hit.GetRayTMin();
+  fsum += hit.GetRayTCurrent();
+  // NOTE(review): the check lines at the top do not cover these three getters.
+  outbuf.Store(0, fsum);
+  outbuf.Store(4, isum);
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll
new file mode 100644
index 0000000000..4fc6a47780
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll
@@ -0,0 +1,687 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+; REQUIRES: dxil-1-9
+
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; outbuf                                UAV    byte         r/w      U0u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RWByteAddressBuffer = type { i32 }
+%dx.types.HitObject = type { i8* }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.dx::HitObject" = type { i32 }
+
+@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+
+; CHECK:  %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %{{[^ ]+}}, i32 1)
+; CHECK:  %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject %{{[^ ]+}})
+; CHECK:  %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %{{[^ ]+}}, i32 42)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 0)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 1)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 2)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 0)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 1)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 2)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 0)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 1)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 2)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 0)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 1)
+; CHECK:  %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 2)
+
+; CHECK:  %[[M34OW00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO:[^ ]+]], i32 0, i32 0)
+; CHECK-NEXT:  %[[M34VOW0:[^ ]+]] = insertelement <12 x float> undef, float %[[M34OW00]], i64 0
+; CHECK-NEXT:  %[[M34OW01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 1)
+; CHECK-NEXT:  %[[M34VOW1:[^ ]+]] = insertelement <12 x float> %[[M34VOW0]], float %[[M34OW01]], i64 1
+; CHECK-NEXT:  %[[M34OW02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 2)
+; CHECK-NEXT:  %[[M34VOW2:[^ ]+]] = insertelement <12 x float> %[[M34VOW1]], float %[[M34OW02]], i64 2
+; CHECK-NEXT:  %[[M34OW03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 3)
+; CHECK-NEXT:  %[[M34VOW3:[^ ]+]] = insertelement <12 x float> %[[M34VOW2]], float %[[M34OW03]], i64 3
+; CHECK-NEXT:  %[[M34OW10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 0)
+; CHECK-NEXT:  %[[M34VOW4:[^ ]+]] = insertelement <12 x float> %[[M34VOW3]], float %[[M34OW10]], i64 4
+; CHECK-NEXT:  %[[M34OW11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 1)
+; CHECK-NEXT:  %[[M34VOW5:[^ ]+]] = insertelement <12 x float> %[[M34VOW4]], float %[[M34OW11]], i64 5
+; CHECK-NEXT:  %[[M34OW12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 2)
+; CHECK-NEXT:  %[[M34VOW6:[^ ]+]] = insertelement <12 x float> %[[M34VOW5]], float %[[M34OW12]], i64 6
+; CHECK-NEXT:  %[[M34OW13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 3)
+; CHECK-NEXT:  %[[M34VOW7:[^ ]+]] = insertelement <12 x float> %[[M34VOW6]], float %[[M34OW13]], i64 7
+; CHECK-NEXT:  %[[M34OW20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 0)
+; CHECK-NEXT:  %[[M34VOW8:[^ ]+]] = insertelement <12 x float> %[[M34VOW7]], float %[[M34OW20]], i64 8
+; CHECK-NEXT:  %[[M34OW21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 1)
+; CHECK-NEXT:  %[[M34VOW9:[^ ]+]] = insertelement <12 x float> %[[M34VOW8]], float %[[M34OW21]], i64 9
+; CHECK-NEXT:  %[[M34OW22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 2)
+; CHECK-NEXT:  %[[M34VOW10:[^ ]+]] = insertelement <12 x float> %[[M34VOW9]], float %[[M34OW22]], i64 10
+; CHECK-NEXT:  %[[M34OW23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 3)
+; CHECK-NEXT:  %{{[^ ]+}} = insertelement <12 x float> %[[M34VOW10]], float %[[M34OW23]], i64 11
+
+; CHECK:  %[[M43OW00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO:[^ ]+]], i32 0, i32 0)
+; CHECK-NEXT:  %[[M43VOW0:[^ ]+]] = insertelement <12 x float> undef, float %[[M43OW00]], i64 0
+; CHECK-NEXT:  %[[M43OW10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 0)
+; CHECK-NEXT:  %[[M43VOW1:[^ ]+]] = insertelement <12 x float> %[[M43VOW0]], float %[[M43OW10]], i64 1
+; CHECK-NEXT:  %[[M43OW20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 0)
+; CHECK-NEXT:  %[[M43VOW2:[^ ]+]] = insertelement <12 x float> %[[M43VOW1]], float %[[M43OW20]], i64 2
+; CHECK-NEXT:  %[[M43OW01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 1)
+; CHECK-NEXT:  %[[M43VOW3:[^ ]+]] = insertelement <12 x float> %[[M43VOW2]], float %[[M43OW01]], i64 3
+; CHECK-NEXT:  %[[M43OW11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 1)
+; CHECK-NEXT:  %[[M43VOW4:[^ ]+]] = insertelement <12 x float> %[[M43VOW3]], float %[[M43OW11]], i64 4
+; CHECK-NEXT:  %[[M43OW21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 1)
+; CHECK-NEXT:  %[[M43VOW5:[^ ]+]] = insertelement <12 x float> %[[M43VOW4]], float %[[M43OW21]], i64 5
+; CHECK-NEXT:  %[[M43OW02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 2)
+; CHECK-NEXT:  %[[M43VOW6:[^ ]+]] = insertelement <12 x float> %[[M43VOW5]], float %[[M43OW02]], i64 6
+; CHECK-NEXT:  %[[M43OW12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 2)
+; CHECK-NEXT:  %[[M43VOW7:[^ ]+]] = insertelement <12 x float> %[[M43VOW6]], float %[[M43OW12]], i64 7
+; CHECK-NEXT:  %[[M43OW22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 2)
+; CHECK-NEXT:  %[[M43VOW8:[^ ]+]] = insertelement <12 x float> %[[M43VOW7]], float %[[M43OW22]], i64 8
+; CHECK-NEXT:  %[[M43OW03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 3)
+; CHECK-NEXT:  %[[M43VOW9:[^ ]+]] = insertelement <12 x float> %[[M43VOW8]], float %[[M43OW03]], i64 9
+; CHECK-NEXT:  %[[M43OW13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 3)
+; CHECK-NEXT:  %[[M43VOW10:[^ ]+]] = insertelement <12 x float> %[[M43VOW9]], float %[[M43OW13]], i64 10
+; CHECK-NEXT:  %[[M43OW23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 3)
+; CHECK-NEXT:  %{{[^ ]+}} = insertelement <12 x float> %[[M43VOW10]], float %[[M43OW23]], i64 11
+
+; CHECK:  %[[M34WO00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO:[^ ]+]], i32 0, i32 0)
+; CHECK-NEXT:  %[[M34VWO0:[^ ]+]] = insertelement <12 x float> undef, float %[[M34WO00]], i64 0
+; CHECK-NEXT:  %[[M34WO01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 1)
+; CHECK-NEXT:  %[[M34VWO1:[^ ]+]] = insertelement <12 x float> %[[M34VWO0]], float %[[M34WO01]], i64 1
+; CHECK-NEXT:  %[[M34WO02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 2)
+; CHECK-NEXT:  %[[M34VWO2:[^ ]+]] = insertelement <12 x float> %[[M34VWO1]], float %[[M34WO02]], i64 2
+; CHECK-NEXT:  %[[M34WO03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 3)
+; CHECK-NEXT:  %[[M34VWO3:[^ ]+]] = insertelement <12 x float> %[[M34VWO2]], float %[[M34WO03]], i64 3
+; CHECK-NEXT:  %[[M34WO10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 0)
+; CHECK-NEXT:  %[[M34VWO4:[^ ]+]] = insertelement <12 x float> %[[M34VWO3]], float %[[M34WO10]], i64 4
+; CHECK-NEXT:  %[[M34WO11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 1)
+; CHECK-NEXT:  %[[M34VWO5:[^ ]+]] = insertelement <12 x float> %[[M34VWO4]], float %[[M34WO11]], i64 5
+; CHECK-NEXT:  %[[M34WO12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 2)
+; CHECK-NEXT:  %[[M34VWO6:[^ ]+]] = insertelement <12 x float> %[[M34VWO5]], float %[[M34WO12]], i64 6
+; CHECK-NEXT:  %[[M34WO13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 3)
+; CHECK-NEXT:  %[[M34VWO7:[^ ]+]] = insertelement <12 x float> %[[M34VWO6]], float %[[M34WO13]], i64 7
+; CHECK-NEXT:  %[[M34WO20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 0)
+; CHECK-NEXT:  %[[M34VWO8:[^ ]+]] = insertelement <12 x float> %[[M34VWO7]], float %[[M34WO20]], i64 8
+; CHECK-NEXT:  %[[M34WO21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 1)
+; CHECK-NEXT:  %[[M34VWO9:[^ ]+]] = insertelement <12 x float> %[[M34VWO8]], float %[[M34WO21]], i64 9
+; CHECK-NEXT:  %[[M34WO22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 2)
+; CHECK-NEXT:  %[[M34VWO10:[^ ]+]] = insertelement <12 x float> %[[M34VWO9]], float %[[M34WO22]], i64 10
+; CHECK-NEXT:  %[[M34WO23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 3)
+; CHECK-NEXT:  %{{[^ ]+}} = insertelement <12 x float> %[[M34VWO10]], float %[[M34WO23]], i64 11
+
+; CHECK:  %[[M43WO00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO:[^ ]+]], i32 0, i32 0)
+; CHECK-NEXT:  %[[M43VWO0:[^ ]+]] = insertelement <12 x float> undef, float %[[M43WO00]], i64 0
+; CHECK-NEXT:  %[[M43WO10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 0)
+; CHECK-NEXT:  %[[M43VWO1:[^ ]+]] = insertelement <12 x float> %[[M43VWO0]], float %[[M43WO10]], i64 1
+; CHECK-NEXT:  %[[M43WO20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 0)
+; CHECK-NEXT:  %[[M43VWO2:[^ ]+]] = insertelement <12 x float> %[[M43VWO1]], float %[[M43WO20]], i64 2
+; CHECK-NEXT:  %[[M43WO01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 1)
+; CHECK-NEXT:  %[[M43VWO3:[^ ]+]] = insertelement <12 x float> %[[M43VWO2]], float %[[M43WO01]], i64 3
+; CHECK-NEXT:  %[[M43WO11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 1)
+; CHECK-NEXT:  %[[M43VWO4:[^ ]+]] = insertelement <12 x float> %[[M43VWO3]], float %[[M43WO11]], i64 4
+; CHECK-NEXT:  %[[M43WO21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 1)
+; CHECK-NEXT:  %[[M43VWO5:[^ ]+]] = insertelement <12 x float> %[[M43VWO4]], float %[[M43WO21]], i64 5
+; CHECK-NEXT:  %[[M43WO02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 2)
+; CHECK-NEXT:  %[[M43VWO6:[^ ]+]] = insertelement <12 x float> %[[M43VWO5]], float %[[M43WO02]], i64 6
+; CHECK-NEXT:  %[[M43WO12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 2)
+; CHECK-NEXT:  %[[M43VWO7:[^ ]+]] = insertelement <12 x float> %[[M43VWO6]], float %[[M43WO12]], i64 7
+; CHECK-NEXT:  %[[M43WO22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 2)
+; CHECK-NEXT:  %[[M43VWO8:[^ ]+]] = insertelement <12 x float> %[[M43VWO7]], float %[[M43WO22]], i64 8
+; CHECK-NEXT:  %[[M43WO03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 3)
+; CHECK-NEXT:  %[[M43VWO9:[^ ]+]] = insertelement <12 x float> %[[M43VWO8]], float %[[M43WO03]], i64 9
+; CHECK-NEXT:  %[[M43WO13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 3)
+; CHECK-NEXT:  %[[M43VWO10:[^ ]+]] = insertelement <12 x float> %[[M43VWO9]], float %[[M43WO13]], i64 10
+; CHECK-NEXT:  %[[M43WO23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 3)
+; CHECK-NEXT:  %{{[^ ]+}} = insertelement <12 x float> %[[M43VWO10]], float %[[M43WO23]], i64 11
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+for.body.i.lr.ph:
+  %0 = alloca [12 x float]
+  %1 = alloca [3 x i32]
+  %2 = alloca [12 x float]
+  %3 = alloca [4 x i32]
+  %4 = alloca [12 x float]
+  %5 = alloca [3 x i32]
+  %6 = alloca [12 x float]
+  %7 = alloca [4 x i32]
+  %hit = alloca %dx.types.HitObject, align 4
+  %8 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !19 ; line:69 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %8) #0, !dbg !19 ; line:69 col:3
+  %9 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !23 ; line:69 col:17
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 388, %dx.types.HitObject* %hit, i32 1), !dbg !24 ; line:75 col:3
+  %10 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 383, %dx.types.HitObject* %hit), !dbg !25 ; line:80 col:11
+  %conv = zext i1 %10 to i32, !dbg !25 ; line:80 col:11
+  %11 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 384, %dx.types.HitObject* %hit), !dbg !26 ; line:81 col:11
+  %conv3 = zext i1 %11 to i32, !dbg !26 ; line:81 col:11
+  %add4 = add nsw i32 %conv, %conv3, !dbg !27 ; line:81 col:8
+  %12 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 385, %dx.types.HitObject* %hit), !dbg !28 ; line:82 col:11
+  %conv6 = zext i1 %12 to i32, !dbg !28 ; line:82 col:11
+  %add7 = add nsw i32 %add4, %conv6, !dbg !29 ; line:82 col:8
+  %13 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 365, %dx.types.HitObject* %hit), !dbg !30 ; line:85 col:11
+  %add9 = add i32 %add7, %13, !dbg !31 ; line:85 col:8
+  %14 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 366, %dx.types.HitObject* %hit), !dbg !32 ; line:86 col:11
+  %add11 = add i32 %add9, %14, !dbg !33 ; line:86 col:8
+  %15 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 368, %dx.types.HitObject* %hit), !dbg !34 ; line:87 col:11
+  %add13 = add i32 %add11, %15, !dbg !35 ; line:87 col:8
+  %16 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 367, %dx.types.HitObject* %hit), !dbg !36 ; line:88 col:11
+  %add15 = add i32 %add13, %16, !dbg !37 ; line:88 col:8
+  %17 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 373, %dx.types.HitObject* %hit), !dbg !38 ; line:89 col:11
+  %add17 = add i32 %add15, %17, !dbg !39 ; line:89 col:8
+  %18 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 377, %dx.types.HitObject* %hit), !dbg !40 ; line:90 col:11
+  %add19 = add i32 %add17, %18, !dbg !41 ; line:90 col:8
+  %19 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %hit, i32 42), !dbg !42 ; line:91 col:11
+  %add21 = add i32 %add19, %19, !dbg !43 ; line:91 col:8
+  %20 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 379, %dx.types.HitObject* %hit), !dbg !44 ; line:94 col:11
+  %add23 = fadd <3 x float> zeroinitializer, %20, !dbg !45 ; line:94 col:8
+  %21 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 378, %dx.types.HitObject* %hit), !dbg !46 ; line:95 col:11
+  %add25 = fadd <3 x float> %add23, %21, !dbg !47 ; line:95 col:8
+  %22 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 370, %dx.types.HitObject* %hit), !dbg !48 ; line:96 col:11
+  %add27 = fadd <3 x float> %add25, %22, !dbg !49 ; line:96 col:8
+  %23 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 369, %dx.types.HitObject* %hit), !dbg !50 ; line:97 col:11
+  %add29 = fadd <3 x float> %add27, %23, !dbg !51 ; line:97 col:8
+  %vsum.0.vec.extract = extractelement <3 x float> %add29, i32 0, !dbg !52 ; line:98 col:11
+  %vsum.4.vec.extract = extractelement <3 x float> %add29, i32 1, !dbg !53 ; line:98 col:21
+  %add30 = fadd float %vsum.0.vec.extract, %vsum.4.vec.extract, !dbg !54 ; line:98 col:19
+  %vsum.8.vec.extract = extractelement <3 x float> %add29, i32 2, !dbg !55 ; line:98 col:31
+  %add31 = fadd float %add30, %vsum.8.vec.extract, !dbg !56 ; line:98 col:29
+  %add32 = fadd float 0.000000e+00, %add31, !dbg !57 ; line:98 col:8
+  %24 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 371, %dx.types.HitObject* %hit), !dbg !58 ; line:101 col:23
+  %row2col = shufflevector <12 x float> %24, <12 x float> %24, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>, !dbg !59 ; line:101 col:11
+  br label %for.body.7.i.lr.ph, !dbg !60 ; line:61 col:3
+
+for.body.7.i.lr.ph:                               ; preds = %for.cond.cleanup.6.i, %for.body.i.lr.ph
+  %i.i.0 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc9.i, %for.cond.cleanup.6.i ]
+  %h.i.0 = phi float [ 0.000000e+00, %for.body.i.lr.ph ], [ %add.i, %for.cond.cleanup.6.i ]
+  br label %for.body.7.i, !dbg !63 ; line:62 col:5
+
+for.cond.cleanup.6.i:                             ; preds = %for.body.7.i
+  %inc9.i = add nsw i32 %i.i.0, 1, !dbg !64 ; line:61 col:26
+  %cmp.i = icmp slt i32 %inc9.i, 3, !dbg !65 ; line:61 col:21
+  br i1 %cmp.i, label %for.body.7.i.lr.ph, label %for.body.i.8.lr.ph, !dbg !60 ; line:61 col:3
+
+for.body.7.i:                                     ; preds = %for.body.7.i.lr.ph, %for.body.7.i
+  %h.i.263 = phi float [ %h.i.0, %for.body.7.i.lr.ph ], [ %add.i, %for.body.7.i ]
+  %j.i.0 = phi i32 [ 0, %for.body.7.i.lr.ph ], [ %inc.i, %for.body.7.i ]
+  %25 = add i32 3, %i.i.0, !dbg !66 ; line:63 col:12
+  %26 = add i32 6, %i.i.0, !dbg !66 ; line:63 col:12
+  %27 = add i32 9, %i.i.0, !dbg !66 ; line:63 col:12
+  %28 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 0, !dbg !66 ; line:63 col:12
+  store i32 %i.i.0, i32* %28, !dbg !66 ; line:63 col:12
+  %29 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 1, !dbg !66 ; line:63 col:12
+  store i32 %25, i32* %29, !dbg !66 ; line:63 col:12
+  %30 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 2, !dbg !66 ; line:63 col:12
+  store i32 %26, i32* %30, !dbg !66 ; line:63 col:12
+  %31 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 3, !dbg !66 ; line:63 col:12
+  store i32 %27, i32* %31, !dbg !66 ; line:63 col:12
+  %32 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 %j.i.0, !dbg !66 ; line:63 col:12
+  %33 = load i32, i32* %32, !dbg !66 ; line:63 col:12
+  %34 = extractelement <12 x float> %row2col, i64 0, !dbg !66 ; line:63 col:12
+  %35 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 0, !dbg !66 ; line:63 col:12
+  store float %34, float* %35, !dbg !66 ; line:63 col:12
+  %36 = extractelement <12 x float> %row2col, i64 1, !dbg !66 ; line:63 col:12
+  %37 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 1, !dbg !66 ; line:63 col:12
+  store float %36, float* %37, !dbg !66 ; line:63 col:12
+  %38 = extractelement <12 x float> %row2col, i64 2, !dbg !66 ; line:63 col:12
+  %39 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 2, !dbg !66 ; line:63 col:12
+  store float %38, float* %39, !dbg !66 ; line:63 col:12
+  %40 = extractelement <12 x float> %row2col, i64 3, !dbg !66 ; line:63 col:12
+  %41 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 3, !dbg !66 ; line:63 col:12
+  store float %40, float* %41, !dbg !66 ; line:63 col:12
+  %42 = extractelement <12 x float> %row2col, i64 4, !dbg !66 ; line:63 col:12
+  %43 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 4, !dbg !66 ; line:63 col:12
+  store float %42, float* %43, !dbg !66 ; line:63 col:12
+  %44 = extractelement <12 x float> %row2col, i64 5, !dbg !66 ; line:63 col:12
+  %45 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 5, !dbg !66 ; line:63 col:12
+  store float %44, float* %45, !dbg !66 ; line:63 col:12
+  %46 = extractelement <12 x float> %row2col, i64 6, !dbg !66 ; line:63 col:12
+  %47 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 6, !dbg !66 ; line:63 col:12
+  store float %46, float* %47, !dbg !66 ; line:63 col:12
+  %48 = extractelement <12 x float> %row2col, i64 7, !dbg !66 ; line:63 col:12
+  %49 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 7, !dbg !66 ; line:63 col:12
+  store float %48, float* %49, !dbg !66 ; line:63 col:12
+  %50 = extractelement <12 x float> %row2col, i64 8, !dbg !66 ; line:63 col:12
+  %51 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 8, !dbg !66 ; line:63 col:12
+  store float %50, float* %51, !dbg !66 ; line:63 col:12
+  %52 = extractelement <12 x float> %row2col, i64 9, !dbg !66 ; line:63 col:12
+  %53 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 9, !dbg !66 ; line:63 col:12
+  store float %52, float* %53, !dbg !66 ; line:63 col:12
+  %54 = extractelement <12 x float> %row2col, i64 10, !dbg !66 ; line:63 col:12
+  %55 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 10, !dbg !66 ; line:63 col:12
+  store float %54, float* %55, !dbg !66 ; line:63 col:12
+  %56 = extractelement <12 x float> %row2col, i64 11, !dbg !66 ; line:63 col:12
+  %57 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 11, !dbg !66 ; line:63 col:12
+  store float %56, float* %57, !dbg !66 ; line:63 col:12
+  %58 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 %33, !dbg !66 ; line:63 col:12
+  %59 = load float, float* %58, !dbg !66 ; line:63 col:12
+  %add.i = fadd float %h.i.263, %59, !dbg !67 ; line:63 col:9
+  %inc.i = add nsw i32 %j.i.0, 1, !dbg !68 ; line:62 col:28
+  %cmp3.i = icmp slt i32 %inc.i, 4, !dbg !69 ; line:62 col:23
+  br i1 %cmp3.i, label %for.body.7.i, label %for.cond.cleanup.6.i, !dbg !63 ; line:62 col:5
+
+for.body.i.8.lr.ph:                               ; preds = %for.cond.cleanup.6.i
+  %add35 = fadd float %add32, %add.i, !dbg !70 ; line:101 col:8
+  %60 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 372, %dx.types.HitObject* %hit), !dbg !71 ; line:102 col:23
+  %row2col52 = shufflevector <12 x float> %60, <12 x float> %60, <12 x i32> <i32 0, i32 3, i32 6, i32 9, i32 1, i32 4, i32 7, i32 10, i32 2, i32 5, i32 8, i32 11>, !dbg !72 ; line:102 col:11
+  br label %for.body.7.i.15.lr.ph, !dbg !73 ; line:61 col:3
+
+for.body.7.i.15.lr.ph:                            ; preds = %for.cond.cleanup.6.i.12, %for.body.i.8.lr.ph
+  %i.i.3.0 = phi i32 [ 0, %for.body.i.8.lr.ph ], [ %inc9.i.11, %for.cond.cleanup.6.i.12 ]
+  %h.i.2.0 = phi float [ 0.000000e+00, %for.body.i.8.lr.ph ], [ %add.i.13, %for.cond.cleanup.6.i.12 ]
+  br label %for.body.7.i.15, !dbg !76 ; line:62 col:5
+
+for.cond.cleanup.6.i.12:                          ; preds = %for.body.7.i.15
+  %inc9.i.11 = add nsw i32 %i.i.3.0, 1, !dbg !77 ; line:61 col:26
+  %cmp.i.6 = icmp slt i32 %inc9.i.11, 4, !dbg !78 ; line:61 col:21
+  br i1 %cmp.i.6, label %for.body.7.i.15.lr.ph, label %for.body.i.23.lr.ph, !dbg !73 ; line:61 col:3
+
+for.body.7.i.15:                                  ; preds = %for.body.7.i.15.lr.ph, %for.body.7.i.15
+  %j.i.5.0 = phi i32 [ 0, %for.body.7.i.15.lr.ph ], [ %inc.i.14, %for.body.7.i.15 ]
+  %h.i.2.2 = phi float [ %h.i.2.0, %for.body.7.i.15.lr.ph ], [ %add.i.13, %for.body.7.i.15 ]
+  %61 = add i32 4, %i.i.3.0, !dbg !79 ; line:63 col:12
+  %62 = add i32 8, %i.i.3.0, !dbg !79 ; line:63 col:12
+  %63 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 0, !dbg !79 ; line:63 col:12
+  store i32 %i.i.3.0, i32* %63, !dbg !79 ; line:63 col:12
+  %64 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 1, !dbg !79 ; line:63 col:12
+  store i32 %61, i32* %64, !dbg !79 ; line:63 col:12
+  %65 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 2, !dbg !79 ; line:63 col:12
+  store i32 %62, i32* %65, !dbg !79 ; line:63 col:12
+  %66 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 %j.i.5.0, !dbg !79 ; line:63 col:12
+  %67 = load i32, i32* %66, !dbg !79 ; line:63 col:12
+  %68 = extractelement <12 x float> %row2col52, i64 0, !dbg !79 ; line:63 col:12
+  %69 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 0, !dbg !79 ; line:63 col:12
+  store float %68, float* %69, !dbg !79 ; line:63 col:12
+  %70 = extractelement <12 x float> %row2col52, i64 1, !dbg !79 ; line:63 col:12
+  %71 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 1, !dbg !79 ; line:63 col:12
+  store float %70, float* %71, !dbg !79 ; line:63 col:12
+  %72 = extractelement <12 x float> %row2col52, i64 2, !dbg !79 ; line:63 col:12
+  %73 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 2, !dbg !79 ; line:63 col:12
+  store float %72, float* %73, !dbg !79 ; line:63 col:12
+  %74 = extractelement <12 x float> %row2col52, i64 3, !dbg !79 ; line:63 col:12
+  %75 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 3, !dbg !79 ; line:63 col:12
+  store float %74, float* %75, !dbg !79 ; line:63 col:12
+  %76 = extractelement <12 x float> %row2col52, i64 4, !dbg !79 ; line:63 col:12
+  %77 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 4, !dbg !79 ; line:63 col:12
+  store float %76, float* %77, !dbg !79 ; line:63 col:12
+  %78 = extractelement <12 x float> %row2col52, i64 5, !dbg !79 ; line:63 col:12
+  %79 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 5, !dbg !79 ; line:63 col:12
+  store float %78, float* %79, !dbg !79 ; line:63 col:12
+  %80 = extractelement <12 x float> %row2col52, i64 6, !dbg !79 ; line:63 col:12
+  %81 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 6, !dbg !79 ; line:63 col:12
+  store float %80, float* %81, !dbg !79 ; line:63 col:12
+  %82 = extractelement <12 x float> %row2col52, i64 7, !dbg !79 ; line:63 col:12
+  %83 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 7, !dbg !79 ; line:63 col:12
+  store float %82, float* %83, !dbg !79 ; line:63 col:12
+  %84 = extractelement <12 x float> %row2col52, i64 8, !dbg !79 ; line:63 col:12
+  %85 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 8, !dbg !79 ; line:63 col:12
+  store float %84, float* %85, !dbg !79 ; line:63 col:12
+  %86 = extractelement <12 x float> %row2col52, i64 9, !dbg !79 ; line:63 col:12
+  %87 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 9, !dbg !79 ; line:63 col:12
+  store float %86, float* %87, !dbg !79 ; line:63 col:12
+  %88 = extractelement <12 x float> %row2col52, i64 10, !dbg !79 ; line:63 col:12
+  %89 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 10, !dbg !79 ; line:63 col:12
+  store float %88, float* %89, !dbg !79 ; line:63 col:12
+  %90 = extractelement <12 x float> %row2col52, i64 11, !dbg !79 ; line:63 col:12
+  %91 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 11, !dbg !79 ; line:63 col:12
+  store float %90, float* %91, !dbg !79 ; line:63 col:12
+  %92 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 %67, !dbg !79 ; line:63 col:12
+  %93 = load float, float* %92, !dbg !79 ; line:63 col:12
+  %add.i.13 = fadd float %h.i.2.2, %93, !dbg !80 ; line:63 col:9
+  %inc.i.14 = add nsw i32 %j.i.5.0, 1, !dbg !81 ; line:62 col:28
+  %cmp3.i.9 = icmp slt i32 %inc.i.14, 3, !dbg !82 ; line:62 col:23
+  br i1 %cmp3.i.9, label %for.body.7.i.15, label %for.cond.cleanup.6.i.12, !dbg !76 ; line:62 col:5
+
+for.body.i.23.lr.ph:                              ; preds = %for.cond.cleanup.6.i.12
+  %add38 = fadd float %add35, %add.i.13, !dbg !83 ; line:102 col:8
+  %94 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 380, %dx.types.HitObject* %hit), !dbg !84 ; line:103 col:23
+  %row2col53 = shufflevector <12 x float> %94, <12 x float> %94, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>, !dbg !85 ; line:103 col:11
+  br label %for.body.7.i.30.lr.ph, !dbg !86 ; line:61 col:3
+
+for.body.7.i.30.lr.ph:                            ; preds = %for.cond.cleanup.6.i.27, %for.body.i.23.lr.ph
+  %i.i.18.0 = phi i32 [ 0, %for.body.i.23.lr.ph ], [ %inc9.i.26, %for.cond.cleanup.6.i.27 ]
+  %h.i.17.0 = phi float [ 0.000000e+00, %for.body.i.23.lr.ph ], [ %add.i.28, %for.cond.cleanup.6.i.27 ]
+  br label %for.body.7.i.30, !dbg !88 ; line:62 col:5
+
+for.cond.cleanup.6.i.27:                          ; preds = %for.body.7.i.30
+  %inc9.i.26 = add nsw i32 %i.i.18.0, 1, !dbg !89 ; line:61 col:26
+  %cmp.i.21 = icmp slt i32 %inc9.i.26, 3, !dbg !90 ; line:61 col:21
+  br i1 %cmp.i.21, label %for.body.7.i.30.lr.ph, label %for.body.i.39.lr.ph, !dbg !86 ; line:61 col:3
+
+for.body.7.i.30:                                  ; preds = %for.body.7.i.30.lr.ph, %for.body.7.i.30
+  %j.i.20.0 = phi i32 [ 0, %for.body.7.i.30.lr.ph ], [ %inc.i.29, %for.body.7.i.30 ]
+  %h.i.17.2 = phi float [ %h.i.17.0, %for.body.7.i.30.lr.ph ], [ %add.i.28, %for.body.7.i.30 ]
+  %95 = add i32 3, %i.i.18.0, !dbg !91 ; line:63 col:12
+  %96 = add i32 6, %i.i.18.0, !dbg !91 ; line:63 col:12
+  %97 = add i32 9, %i.i.18.0, !dbg !91 ; line:63 col:12
+  %98 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 0, !dbg !91 ; line:63 col:12
+  store i32 %i.i.18.0, i32* %98, !dbg !91 ; line:63 col:12
+  %99 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 1, !dbg !91 ; line:63 col:12
+  store i32 %95, i32* %99, !dbg !91 ; line:63 col:12
+  %100 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 2, !dbg !91 ; line:63 col:12
+  store i32 %96, i32* %100, !dbg !91 ; line:63 col:12
+  %101 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 3, !dbg !91 ; line:63 col:12
+  store i32 %97, i32* %101, !dbg !91 ; line:63 col:12
+  %102 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 %j.i.20.0, !dbg !91 ; line:63 col:12
+  %103 = load i32, i32* %102, !dbg !91 ; line:63 col:12
+  %104 = extractelement <12 x float> %row2col53, i64 0, !dbg !91 ; line:63 col:12
+  %105 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 0, !dbg !91 ; line:63 col:12
+  store float %104, float* %105, !dbg !91 ; line:63 col:12
+  %106 = extractelement <12 x float> %row2col53, i64 1, !dbg !91 ; line:63 col:12
+  %107 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 1, !dbg !91 ; line:63 col:12
+  store float %106, float* %107, !dbg !91 ; line:63 col:12
+  %108 = extractelement <12 x float> %row2col53, i64 2, !dbg !91 ; line:63 col:12
+  %109 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 2, !dbg !91 ; line:63 col:12
+  store float %108, float* %109, !dbg !91 ; line:63 col:12
+  %110 = extractelement <12 x float> %row2col53, i64 3, !dbg !91 ; line:63 col:12
+  %111 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 3, !dbg !91 ; line:63 col:12
+  store float %110, float* %111, !dbg !91 ; line:63 col:12
+  %112 = extractelement <12 x float> %row2col53, i64 4, !dbg !91 ; line:63 col:12
+  %113 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 4, !dbg !91 ; line:63 col:12
+  store float %112, float* %113, !dbg !91 ; line:63 col:12
+  %114 = extractelement <12 x float> %row2col53, i64 5, !dbg !91 ; line:63 col:12
+  %115 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 5, !dbg !91 ; line:63 col:12
+  store float %114, float* %115, !dbg !91 ; line:63 col:12
+  %116 = extractelement <12 x float> %row2col53, i64 6, !dbg !91 ; line:63 col:12
+  %117 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 6, !dbg !91 ; line:63 col:12
+  store float %116, float* %117, !dbg !91 ; line:63 col:12
+  %118 = extractelement <12 x float> %row2col53, i64 7, !dbg !91 ; line:63 col:12
+  %119 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 7, !dbg !91 ; line:63 col:12
+  store float %118, float* %119, !dbg !91 ; line:63 col:12
+  %120 = extractelement <12 x float> %row2col53, i64 8, !dbg !91 ; line:63 col:12
+  %121 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 8, !dbg !91 ; line:63 col:12
+  store float %120, float* %121, !dbg !91 ; line:63 col:12
+  %122 = extractelement <12 x float> %row2col53, i64 9, !dbg !91 ; line:63 col:12
+  %123 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 9, !dbg !91 ; line:63 col:12
+  store float %122, float* %123, !dbg !91 ; line:63 col:12
+  %124 = extractelement <12 x float> %row2col53, i64 10, !dbg !91 ; line:63 col:12
+  %125 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 10, !dbg !91 ; line:63 col:12
+  store float %124, float* %125, !dbg !91 ; line:63 col:12
+  %126 = extractelement <12 x float> %row2col53, i64 11, !dbg !91 ; line:63 col:12
+  %127 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 11, !dbg !91 ; line:63 col:12
+  store float %126, float* %127, !dbg !91 ; line:63 col:12
+  %128 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 %103, !dbg !91 ; line:63 col:12
+  %129 = load float, float* %128, !dbg !91 ; line:63 col:12
+  %add.i.28 = fadd float %h.i.17.2, %129, !dbg !92 ; line:63 col:9
+  %inc.i.29 = add nsw i32 %j.i.20.0, 1, !dbg !93 ; line:62 col:28
+  %cmp3.i.24 = icmp slt i32 %inc.i.29, 4, !dbg !94 ; line:62 col:23
+  br i1 %cmp3.i.24, label %for.body.7.i.30, label %for.cond.cleanup.6.i.27, !dbg !88 ; line:62 col:5
+
+for.body.i.39.lr.ph:                              ; preds = %for.cond.cleanup.6.i.27
+  %add41 = fadd float %add38, %add.i.28, !dbg !95 ; line:103 col:8
+  %130 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 381, %dx.types.HitObject* %hit), !dbg !96 ; line:104 col:23
+  %row2col54 = shufflevector <12 x float> %130, <12 x float> %130, <12 x i32> <i32 0, i32 3, i32 6, i32 9, i32 1, i32 4, i32 7, i32 10, i32 2, i32 5, i32 8, i32 11>, !dbg !97 ; line:104 col:11
+  br label %for.body.7.i.46.lr.ph, !dbg !98 ; line:61 col:3
+
+for.body.7.i.46.lr.ph:                            ; preds = %for.cond.cleanup.6.i.43, %for.body.i.39.lr.ph
+  %i.i.34.0 = phi i32 [ 0, %for.body.i.39.lr.ph ], [ %inc9.i.42, %for.cond.cleanup.6.i.43 ]
+  %h.i.33.0 = phi float [ 0.000000e+00, %for.body.i.39.lr.ph ], [ %add.i.44, %for.cond.cleanup.6.i.43 ]
+  br label %for.body.7.i.46, !dbg !100 ; line:62 col:5
+
+for.cond.cleanup.6.i.43:                          ; preds = %for.body.7.i.46
+  %inc9.i.42 = add nsw i32 %i.i.34.0, 1, !dbg !101 ; line:61 col:26
+  %cmp.i.37 = icmp slt i32 %inc9.i.42, 4, !dbg !102 ; line:61 col:21
+  br i1 %cmp.i.37, label %for.body.7.i.46.lr.ph, label %"\01??$hashM@$03$02@@YAMV?$matrix@M$03$02@@@Z.exit.47", !dbg !98 ; line:61 col:3
+
+for.body.7.i.46:                                  ; preds = %for.body.7.i.46.lr.ph, %for.body.7.i.46
+  %j.i.36.0 = phi i32 [ 0, %for.body.7.i.46.lr.ph ], [ %inc.i.45, %for.body.7.i.46 ]
+  %h.i.33.2 = phi float [ %h.i.33.0, %for.body.7.i.46.lr.ph ], [ %add.i.44, %for.body.7.i.46 ]
+  %131 = add i32 4, %i.i.34.0, !dbg !103 ; line:63 col:12
+  %132 = add i32 8, %i.i.34.0, !dbg !103 ; line:63 col:12
+  %133 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 0, !dbg !103 ; line:63 col:12
+  store i32 %i.i.34.0, i32* %133, !dbg !103 ; line:63 col:12
+  %134 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 1, !dbg !103 ; line:63 col:12
+  store i32 %131, i32* %134, !dbg !103 ; line:63 col:12
+  %135 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 2, !dbg !103 ; line:63 col:12
+  store i32 %132, i32* %135, !dbg !103 ; line:63 col:12
+  %136 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 %j.i.36.0, !dbg !103 ; line:63 col:12
+  %137 = load i32, i32* %136, !dbg !103 ; line:63 col:12
+  %138 = extractelement <12 x float> %row2col54, i64 0, !dbg !103 ; line:63 col:12
+  %139 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 0, !dbg !103 ; line:63 col:12
+  store float %138, float* %139, !dbg !103 ; line:63 col:12
+  %140 = extractelement <12 x float> %row2col54, i64 1, !dbg !103 ; line:63 col:12
+  %141 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 1, !dbg !103 ; line:63 col:12
+  store float %140, float* %141, !dbg !103 ; line:63 col:12
+  %142 = extractelement <12 x float> %row2col54, i64 2, !dbg !103 ; line:63 col:12
+  %143 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 2, !dbg !103 ; line:63 col:12
+  store float %142, float* %143, !dbg !103 ; line:63 col:12
+  %144 = extractelement <12 x float> %row2col54, i64 3, !dbg !103 ; line:63 col:12
+  %145 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 3, !dbg !103 ; line:63 col:12
+  store float %144, float* %145, !dbg !103 ; line:63 col:12
+  %146 = extractelement <12 x float> %row2col54, i64 4, !dbg !103 ; line:63 col:12
+  %147 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 4, !dbg !103 ; line:63 col:12
+  store float %146, float* %147, !dbg !103 ; line:63 col:12
+  %148 = extractelement <12 x float> %row2col54, i64 5, !dbg !103 ; line:63 col:12
+  %149 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 5, !dbg !103 ; line:63 col:12
+  store float %148, float* %149, !dbg !103 ; line:63 col:12
+  %150 = extractelement <12 x float> %row2col54, i64 6, !dbg !103 ; line:63 col:12
+  %151 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 6, !dbg !103 ; line:63 col:12
+  store float %150, float* %151, !dbg !103 ; line:63 col:12
+  %152 = extractelement <12 x float> %row2col54, i64 7, !dbg !103 ; line:63 col:12
+  %153 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 7, !dbg !103 ; line:63 col:12
+  store float %152, float* %153, !dbg !103 ; line:63 col:12
+  %154 = extractelement <12 x float> %row2col54, i64 8, !dbg !103 ; line:63 col:12
+  %155 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 8, !dbg !103 ; line:63 col:12
+  store float %154, float* %155, !dbg !103 ; line:63 col:12
+  %156 = extractelement <12 x float> %row2col54, i64 9, !dbg !103 ; line:63 col:12
+  %157 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 9, !dbg !103 ; line:63 col:12
+  store float %156, float* %157, !dbg !103 ; line:63 col:12
+  %158 = extractelement <12 x float> %row2col54, i64 10, !dbg !103 ; line:63 col:12
+  %159 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 10, !dbg !103 ; line:63 col:12
+  store float %158, float* %159, !dbg !103 ; line:63 col:12
+  %160 = extractelement <12 x float> %row2col54, i64 11, !dbg !103 ; line:63 col:12
+  %161 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 11, !dbg !103 ; line:63 col:12
+  store float %160, float* %161, !dbg !103 ; line:63 col:12
+  %162 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 %137, !dbg !103 ; line:63 col:12
+  %163 = load float, float* %162, !dbg !103 ; line:63 col:12
+  %add.i.44 = fadd float %h.i.33.2, %163, !dbg !104 ; line:63 col:9
+  %inc.i.45 = add nsw i32 %j.i.36.0, 1, !dbg !105 ; line:62 col:28
+  %cmp3.i.40 = icmp slt i32 %inc.i.45, 3, !dbg !106 ; line:62 col:23
+  br i1 %cmp3.i.40, label %for.body.7.i.46, label %for.cond.cleanup.6.i.43, !dbg !100 ; line:62 col:5
+
+"\01??$hashM@$03$02@@YAMV?$matrix@M$03$02@@@Z.exit.47": ; preds = %for.cond.cleanup.6.i.43
+  %add44 = fadd float %add41, %add.i.44, !dbg !107 ; line:104 col:8
+  %164 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 374, %dx.types.HitObject* %hit), !dbg !108 ; line:107 col:11
+  %add46 = add i32 %add21, %164, !dbg !109 ; line:107 col:8
+  %165 = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 376, %dx.types.HitObject* %hit), !dbg !110 ; line:108 col:11
+  %add48 = fadd float %add44, %165, !dbg !111 ; line:108 col:8
+  %166 = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 375, %dx.types.HitObject* %hit), !dbg !112 ; line:109 col:11
+  %add50 = fadd float %add48, %166, !dbg !113 ; line:109 col:8
+  %167 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !114 ; line:111 col:3
+  %168 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %167), !dbg !114 ; line:111 col:3
+  %169 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %168, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !114 ; line:111 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %169, i32 0, float %add50), !dbg !114 ; line:111 col:3
+  %170 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !115 ; line:112 col:3
+  %171 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %170), !dbg !115 ; line:112 col:3
+  %172 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %171, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !115 ; line:112 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32)"(i32 277, %dx.types.Handle %172, i32 4, i32 %add46), !dbg !115 ; line:112 col:3
+  %173 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !116 ; line:113 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %173) #0, !dbg !116 ; line:113 col:1
+  ret void, !dbg !116 ; line:113 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #0
+
+; Function Attrs: nounwind readnone
+declare i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1
+
+; Function Attrs: nounwind readonly
+declare i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #2
+
+; Function Attrs: nounwind readnone
+declare <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1
+
+; Function Attrs: nounwind readnone
+declare float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32)"(i32, %dx.types.Handle, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!3}
+!dx.shaderModel = !{!4}
+!dx.typeAnnotations = !{!5, !8}
+!dx.entryPoints = !{!12}
+!dx.fnprops = !{!16}
+!dx.options = !{!17, !18}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.4891 (staging/ser_hlslaccessors_patch, 1ca27ee12)"}
+!3 = !{i32 1, i32 9}
+!4 = !{!"lib", i32 6, i32 9}
+!5 = !{i32 0, %"class.dx::HitObject" undef, !6}
+!6 = !{i32 4, !7}
+!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!8 = !{i32 1, void ()* @"\01?main@@YAXXZ", !9}
+!9 = !{!10}
+!10 = !{i32 1, !11, !11}
+!11 = !{}
+!12 = !{null, !"", null, !13, null}
+!13 = !{null, !14, null, null}
+!14 = !{!15}
+!15 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!16 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!17 = !{i32 -2147483584}
+!18 = !{i32 -1}
+!19 = !DILocation(line: 69, column: 3, scope: !20)
+!20 = !DISubprogram(name: "main", scope: !21, file: !21, line: 68, type: !22, isLocal: false, isDefinition: true, scopeLine: 68, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!21 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl", directory: "")
+!22 = !DISubroutineType(types: !11)
+!23 = !DILocation(line: 69, column: 17, scope: !20)
+!24 = !DILocation(line: 75, column: 3, scope: !20)
+!25 = !DILocation(line: 80, column: 11, scope: !20)
+!26 = !DILocation(line: 81, column: 11, scope: !20)
+!27 = !DILocation(line: 81, column: 8, scope: !20)
+!28 = !DILocation(line: 82, column: 11, scope: !20)
+!29 = !DILocation(line: 82, column: 8, scope: !20)
+!30 = !DILocation(line: 85, column: 11, scope: !20)
+!31 = !DILocation(line: 85, column: 8, scope: !20)
+!32 = !DILocation(line: 86, column: 11, scope: !20)
+!33 = !DILocation(line: 86, column: 8, scope: !20)
+!34 = !DILocation(line: 87, column: 11, scope: !20)
+!35 = !DILocation(line: 87, column: 8, scope: !20)
+!36 = !DILocation(line: 88, column: 11, scope: !20)
+!37 = !DILocation(line: 88, column: 8, scope: !20)
+!38 = !DILocation(line: 89, column: 11, scope: !20)
+!39 = !DILocation(line: 89, column: 8, scope: !20)
+!40 = !DILocation(line: 90, column: 11, scope: !20)
+!41 = !DILocation(line: 90, column: 8, scope: !20)
+!42 = !DILocation(line: 91, column: 11, scope: !20)
+!43 = !DILocation(line: 91, column: 8, scope: !20)
+!44 = !DILocation(line: 94, column: 11, scope: !20)
+!45 = !DILocation(line: 94, column: 8, scope: !20)
+!46 = !DILocation(line: 95, column: 11, scope: !20)
+!47 = !DILocation(line: 95, column: 8, scope: !20)
+!48 = !DILocation(line: 96, column: 11, scope: !20)
+!49 = !DILocation(line: 96, column: 8, scope: !20)
+!50 = !DILocation(line: 97, column: 11, scope: !20)
+!51 = !DILocation(line: 97, column: 8, scope: !20)
+!52 = !DILocation(line: 98, column: 11, scope: !20)
+!53 = !DILocation(line: 98, column: 21, scope: !20)
+!54 = !DILocation(line: 98, column: 19, scope: !20)
+!55 = !DILocation(line: 98, column: 31, scope: !20)
+!56 = !DILocation(line: 98, column: 29, scope: !20)
+!57 = !DILocation(line: 98, column: 8, scope: !20)
+!58 = !DILocation(line: 101, column: 23, scope: !20)
+!59 = !DILocation(line: 101, column: 11, scope: !20)
+!60 = !DILocation(line: 61, column: 3, scope: !61, inlinedAt: !62)
+!61 = !DISubprogram(name: "hashM<3, 4>", scope: !21, file: !21, line: 59, type: !22, isLocal: false, isDefinition: true, scopeLine: 59, flags: DIFlagPrototyped, isOptimized: false)
+!62 = distinct !DILocation(line: 101, column: 11, scope: !20)
+!63 = !DILocation(line: 62, column: 5, scope: !61, inlinedAt: !62)
+!64 = !DILocation(line: 61, column: 26, scope: !61, inlinedAt: !62)
+!65 = !DILocation(line: 61, column: 21, scope: !61, inlinedAt: !62)
+!66 = !DILocation(line: 63, column: 12, scope: !61, inlinedAt: !62)
+!67 = !DILocation(line: 63, column: 9, scope: !61, inlinedAt: !62)
+!68 = !DILocation(line: 62, column: 28, scope: !61, inlinedAt: !62)
+!69 = !DILocation(line: 62, column: 23, scope: !61, inlinedAt: !62)
+!70 = !DILocation(line: 101, column: 8, scope: !20)
+!71 = !DILocation(line: 102, column: 23, scope: !20)
+!72 = !DILocation(line: 102, column: 11, scope: !20)
+!73 = !DILocation(line: 61, column: 3, scope: !74, inlinedAt: !75)
+!74 = !DISubprogram(name: "hashM<4, 3>", scope: !21, file: !21, line: 59, type: !22, isLocal: false, isDefinition: true, scopeLine: 59, flags: DIFlagPrototyped, isOptimized: false)
+!75 = distinct !DILocation(line: 102, column: 11, scope: !20)
+!76 = !DILocation(line: 62, column: 5, scope: !74, inlinedAt: !75)
+!77 = !DILocation(line: 61, column: 26, scope: !74, inlinedAt: !75)
+!78 = !DILocation(line: 61, column: 21, scope: !74, inlinedAt: !75)
+!79 = !DILocation(line: 63, column: 12, scope: !74, inlinedAt: !75)
+!80 = !DILocation(line: 63, column: 9, scope: !74, inlinedAt: !75)
+!81 = !DILocation(line: 62, column: 28, scope: !74, inlinedAt: !75)
+!82 = !DILocation(line: 62, column: 23, scope: !74, inlinedAt: !75)
+!83 = !DILocation(line: 102, column: 8, scope: !20)
+!84 = !DILocation(line: 103, column: 23, scope: !20)
+!85 = !DILocation(line: 103, column: 11, scope: !20)
+!86 = !DILocation(line: 61, column: 3, scope: !61, inlinedAt: !87)
+!87 = distinct !DILocation(line: 103, column: 11, scope: !20)
+!88 = !DILocation(line: 62, column: 5, scope: !61, inlinedAt: !87)
+!89 = !DILocation(line: 61, column: 26, scope: !61, inlinedAt: !87)
+!90 = !DILocation(line: 61, column: 21, scope: !61, inlinedAt: !87)
+!91 = !DILocation(line: 63, column: 12, scope: !61, inlinedAt: !87)
+!92 = !DILocation(line: 63, column: 9, scope: !61, inlinedAt: !87)
+!93 = !DILocation(line: 62, column: 28, scope: !61, inlinedAt: !87)
+!94 = !DILocation(line: 62, column: 23, scope: !61, inlinedAt: !87)
+!95 = !DILocation(line: 103, column: 8, scope: !20)
+!96 = !DILocation(line: 104, column: 23, scope: !20)
+!97 = !DILocation(line: 104, column: 11, scope: !20)
+!98 = !DILocation(line: 61, column: 3, scope: !74, inlinedAt: !99)
+!99 = distinct !DILocation(line: 104, column: 11, scope: !20)
+!100 = !DILocation(line: 62, column: 5, scope: !74, inlinedAt: !99)
+!101 = !DILocation(line: 61, column: 26, scope: !74, inlinedAt: !99)
+!102 = !DILocation(line: 61, column: 21, scope: !74, inlinedAt: !99)
+!103 = !DILocation(line: 63, column: 12, scope: !74, inlinedAt: !99)
+!104 = !DILocation(line: 63, column: 9, scope: !74, inlinedAt: !99)
+!105 = !DILocation(line: 62, column: 28, scope: !74, inlinedAt: !99)
+!106 = !DILocation(line: 62, column: 23, scope: !74, inlinedAt: !99)
+!107 = !DILocation(line: 104, column: 8, scope: !20)
+!108 = !DILocation(line: 107, column: 11, scope: !20)
+!109 = !DILocation(line: 107, column: 8, scope: !20)
+!110 = !DILocation(line: 108, column: 11, scope: !20)
+!111 = !DILocation(line: 108, column: 8, scope: !20)
+!112 = !DILocation(line: 109, column: 11, scope: !20)
+!113 = !DILocation(line: 109, column: 8, scope: !20)
+!114 = !DILocation(line: 111, column: 3, scope: !20)
+!115 = !DILocation(line: 112, column: 3, scope: !20)
+!116 = !DILocation(line: 113, column: 1, scope: !20)
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl
new file mode 100644
index 0000000000..7b4182b739
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl
@@ -0,0 +1,263 @@
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetHitKind
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetHitKind 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetHitKind 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 366
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetInstanceID
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetInstanceID 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetInstanceID 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 367
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetInstanceIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetInstanceIndex 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetInstanceIndex 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 368
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetObjectRayDirection
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetObjectRayDirection 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetObjectRayDirection 'vector<float, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'vector<float, 3>':'vector<float, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 369
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetObjectRayOrigin
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetObjectRayOrigin 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetObjectRayOrigin 'vector<float, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'vector<float, 3>':'vector<float, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 370
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetObjectToWorld3x4
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetObjectToWorld3x4 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetObjectToWorld3x4 'matrix<float, 3, 4> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'matrix<float, 3, 4>':'matrix<float, 3, 4>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 371
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetObjectToWorld4x3
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetObjectToWorld4x3 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetObjectToWorld4x3 'matrix<float, 4, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'matrix<float, 4, 3>':'matrix<float, 4, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 372
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetPrimitiveIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetPrimitiveIndex 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetPrimitiveIndex 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 373
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetRayFlags
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetRayFlags 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetRayFlags 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 374
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetRayTCurrent
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetRayTCurrent 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetRayTCurrent 'float ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'float'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 375
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetRayTMin
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetRayTMin 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetRayTMin 'float ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'float'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 376
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetShaderTableIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetShaderTableIndex 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetShaderTableIndex 'unsigned int ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 377
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetWorldRayDirection
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetWorldRayDirection 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetWorldRayDirection 'vector<float, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'vector<float, 3>':'vector<float, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 378
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetWorldRayOrigin
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetWorldRayOrigin 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetWorldRayOrigin 'vector<float, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'vector<float, 3>':'vector<float, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 379
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetWorldToObject3x4
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetWorldToObject3x4 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetWorldToObject3x4 'matrix<float, 3, 4> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'matrix<float, 3, 4>':'matrix<float, 3, 4>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 380
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetWorldToObject4x3
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetWorldToObject4x3 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetWorldToObject4x3 'matrix<float, 4, 3> ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'matrix<float, 4, 3>':'matrix<float, 4, 3>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 381
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> IsHit
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit IsHit 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used IsHit 'bool ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'bool'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 383
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> IsMiss
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit IsMiss 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used IsMiss 'bool ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'bool'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 384
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> IsNop
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit IsNop 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used IsNop 'bool ()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'bool'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 385
+// AST-NEXT: | | |   |-ConstAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> LoadLocalRootTableConstant
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TRootConstantOffsetInBytes
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit LoadLocalRootTableConstant 'TResult (TRootConstantOffsetInBytes) const'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RootConstantOffsetInBytes 'TRootConstantOffsetInBytes'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used LoadLocalRootTableConstant 'unsigned int (unsigned int)' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> LoadLocalRootTableConstant 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 386
+// AST-NEXT: | | |   |-PureAttr {{[^ ]+}} <<invalid sloc>> Implicit
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> SetShaderTableIndex
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TRecordIndex
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit SetShaderTableIndex 'TResult (TRecordIndex) const'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RecordIndex 'TRecordIndex'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used SetShaderTableIndex 'void (unsigned int)' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'void'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> SetShaderTableIndex 'unsigned int'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 388
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// FCGL: define void @"\01?main@@YAXXZ"() #0 {
+// FCGL:   %{{[^ ]+}} = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %[[HIT:[^ ]+]])
+// FCGL:   call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 388, %dx.types.HitObject* %[[HIT]], i32 1)
+// FCGL:   %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 383, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 384, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 385, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 365, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 366, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 368, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 367, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 373, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 377, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %[[HIT]], i32 42)
+// FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 379, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 378, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 370, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 369, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call %class.matrix.float.3.4 @"dx.hl.op.rn.%class.matrix.float.3.4 (i32, %dx.types.HitObject*)"(i32 371, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call %class.matrix.float.4.3 @"dx.hl.op.rn.%class.matrix.float.4.3 (i32, %dx.types.HitObject*)"(i32 372, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call %class.matrix.float.3.4 @"dx.hl.op.rn.%class.matrix.float.3.4 (i32, %dx.types.HitObject*)"(i32 380, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call %class.matrix.float.4.3 @"dx.hl.op.rn.%class.matrix.float.4.3 (i32, %dx.types.HitObject*)"(i32 381, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 374, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 376, %dx.types.HitObject* %[[HIT]])
+// FCGL:   %{{[^ ]+}} = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 375, %dx.types.HitObject* %[[HIT]])
+// FCGL:   ret void
+
+RWByteAddressBuffer outbuf;
+
+template <int M, int N> // M, N: the two dimensions of the matrix argument
+float hashM(in matrix<float, M, N> mat) { // returns the sum of all M*N elements
+  float h = 0.f; // running sum
+  for (int i = 0; i < M; ++i)
+    for (int j = 0; j < N; ++j)
+      h += mat[i][j];
+  return h;
+}
+
+[shader("raygeneration")]
+void main() { // calls each HitObject accessor once and folds the results
+  dx::HitObject hit;
+  int isum = 0; // accumulates bool and integer accessor results
+  float fsum = 0.0f; // accumulates float, vector and matrix accessor results
+  vector<float, 3> vsum = 0; // accumulates the float3 accessor results
+
+  ///// Setters (the only setter exercised here is SetShaderTableIndex)
+  hit.SetShaderTableIndex(1);
+
+  ///// Getters (grouped by the return type named in the FCGL checks)
+
+  // i1 accessors (bool results are added into the integer sum)
+  isum += hit.IsHit();
+  isum += hit.IsMiss();
+  isum += hit.IsNop();
+
+  // i32 accessors (LoadLocalRootTableConstant also takes an offset argument)
+  isum += hit.GetGeometryIndex();
+  isum += hit.GetHitKind();
+  isum += hit.GetInstanceIndex();
+  isum += hit.GetInstanceID();
+  isum += hit.GetPrimitiveIndex();
+  isum += hit.GetShaderTableIndex();
+  isum += hit.LoadLocalRootTableConstant(42);
+
+  // float3 accessors (summed component-wise, then folded into fsum)
+  vsum += hit.GetWorldRayOrigin();
+  vsum += hit.GetWorldRayDirection();
+  vsum += hit.GetObjectRayOrigin();
+  vsum += hit.GetObjectRayDirection();
+  fsum += vsum[0] + vsum[1] + vsum[2];
+
+  // matrix accessors (each matrix is reduced to a scalar by hashM)
+  fsum += hashM<3, 4>(hit.GetObjectToWorld3x4());
+  fsum += hashM<4, 3>(hit.GetObjectToWorld4x3());
+  fsum += hashM<3, 4>(hit.GetWorldToObject3x4());
+  fsum += hashM<4, 3>(hit.GetWorldToObject4x3());
+
+  // ray flags (integer, folded into isum) followed by the f32 accessors
+  isum += hit.GetRayFlags();
+  fsum += hit.GetRayTMin();
+  fsum += hit.GetRayTCurrent();
+
+  outbuf.Store(0, fsum); // float sum is stored at byte offset 0
+  outbuf.Store(4, isum); // integer sum is stored at byte offset 4
+}

From 847d5ad29ed989b489e9404e0a94f6d1bdfeeb25 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Wed, 23 Apr 2025 18:36:15 +0200
Subject: [PATCH 08/93] [SER] HitObject::GetAttributes HLSL -> DXIL lowering
 and attributes sema (#7361)

Lowering for `HitObject::GetAttributes<T>()`

Specification:
https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md
DXC SER implementation tracker:
https://github.com/microsoft/DirectXShaderCompiler/issues/7214
---
 lib/HLSL/HLOperationLower.cpp                 |  18 ++-
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +-
 tools/clang/lib/Sema/SemaDXR.cpp              |  12 +-
 tools/clang/lib/Sema/SemaHLSL.cpp             |  63 ++++++---
 .../HitObject/hitobject_attributes.hlsl       |  26 ++++
 .../DxilGen/hitobject_attributes_dxilgen.ll   | 130 ++++++++++++++++++
 .../HitObject/hitobject_attributes.hlsl       |  28 ++++
 .../hitobject_attributes_invalid_longvec.hlsl |  14 ++
 .../hitobject_attributes_invalid_udt.hlsl     |  14 ++
 9 files changed, 284 insertions(+), 25 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 1e43cce07c..f8a9f528cc 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -6375,7 +6375,23 @@ Value *TranslateHitObjectGetAttributes(CallInst *CI, IntrinsicOp IOP,
                                        HLOperationLowerHelper &Helper,
                                        HLObjectOperationLowerHelper *pObjHelper,
                                        bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  Value *HitObject = Builder.CreateLoad(HitObjectPtr);
+
+  Type *AttrTy = cast<PointerType>(CI->getType())->getPointerElementType();
+
+  IRBuilder<> EntryBuilder(
+      dxilutil::FindAllocaInsertionPt(CI->getParent()->getParent()));
+  unsigned AttrAlign = Helper.dataLayout.getABITypeAlignment(AttrTy);
+  AllocaInst *AttrMem = EntryBuilder.CreateAlloca(AttrTy);
+  AttrMem->setAlignment(AttrAlign);
+  Constant *opArg = OP->GetU32Const((unsigned)OpCode);
+  TrivialDxilOperation(OpCode, {opArg, HitObject, AttrMem}, CI->getType(),
+                       Helper.voidTy, OP, Builder);
+  return AttrMem;
 }
 
 Value *TranslateHitObjectScalarGetter(CallInst *CI, IntrinsicOp IOP,
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 6254e5fc71..ae7e777180 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7646,7 +7646,7 @@ def err_payload_requires_inout : Error<
 def err_attributes_requiers_in : Error<
   "intersection attributes parameter %0 must be 'in'">;
 def err_payload_attrs_must_be_udt : Error<
-  "%select{payload|attributes|callable}0 parameter %1 must be a user-defined type composed of only numeric types">;
+  "%select{payload|attributes|callable}0 %select{parameter %2|type}1 must be a user-defined type composed of only numeric types">;
 def err_shader_must_return_void : Error<
   "return type for '%0' shaders must be void">;
 def err_raytracing_entry_param_count : Error<
@@ -7885,7 +7885,7 @@ def err_hlsl_unsupported_long_vector
     "cbuffers or tbuffers|user-defined struct parameter|"
     "entry function parameters|entry function return type|"
     "patch constant function parameters|patch constant function return type|"
-    "payload parameters}0 are not supported">;
+    "payload parameters|attributes}0 are not supported">;
 def err_hlsl_logical_binop_scalar : Error<
    "operands for short-circuiting logical binary operator must be scalar, for non-scalar types use '%select{and|or}0'">;
 def err_hlsl_ternary_scalar : Error<
diff --git a/tools/clang/lib/Sema/SemaDXR.cpp b/tools/clang/lib/Sema/SemaDXR.cpp
index e5b2140cca..f0102f9e3f 100644
--- a/tools/clang/lib/Sema/SemaDXR.cpp
+++ b/tools/clang/lib/Sema/SemaDXR.cpp
@@ -829,7 +829,8 @@ void DiagnoseBuiltinCallWithPayload(Sema &S, const VarDecl *Payload,
   // Verify that the payload type is legal
   if (!hlsl::IsHLSLCopyableAnnotatableRecord(Payload->getType())) {
     S.Diag(Payload->getLocation(), diag::err_payload_attrs_must_be_udt)
-        << /*payload|attributes|callable*/ 0 << Payload;
+        << /*payload|attributes|callable*/ 0 << /*parameter %2|type*/ 0
+        << Payload;
     return;
   }
 
@@ -1194,7 +1195,8 @@ void DiagnoseCallableEntry(Sema &S, FunctionDecl *FD,
 
     if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty)))
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
-          << /*payload|attributes|callable*/ 2 << Param;
+          << /*payload|attributes|callable*/ 2 << /*parameter %2|type*/ 0
+          << Param;
   }
   return;
 }
@@ -1235,7 +1237,8 @@ void DiagnoseMissOrAnyHitEntry(Sema &S, FunctionDecl *FD,
 
     if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) {
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
-          << /*payload|attributes|callable*/ Idx << Param;
+          << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0
+          << Param;
     }
   }
   return;
@@ -1288,7 +1291,8 @@ void DiagnoseClosestHitEntry(Sema &S, FunctionDecl *FD,
 
     if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) {
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
-          << /*payload|attributes|callable*/ Idx << Param;
+          << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0
+          << Param;
     }
   }
   return;
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index bddf834509..6eadfeaac9 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -10770,6 +10770,22 @@ HLSLExternalSource::ApplyTypeSpecSignToParsedType(clang::QualType &type,
   }
 }
 
+bool DiagnoseIntersectionAttributes(Sema &S, SourceLocation Loc, QualType Ty) {
+  // Must be a UDT; a null type is rejected too. Diagnoses at Loc on failure.
+  if (Ty.isNull() || !hlsl::IsHLSLCopyableAnnotatableRecord(Ty)) {
+    S.Diag(Loc, diag::err_payload_attrs_must_be_udt)
+        << /*payload|attributes|callable*/ 1 << /*parameter %2|type*/ 1;
+    return false;
+  }
+
+  if (ContainsLongVector(Ty)) { // long vectors get their own diagnostic
+    const unsigned AttributesIdx = 11; // presumably the "attributes" %select entry; verify against the .td
+    S.Diag(Loc, diag::err_hlsl_unsupported_long_vector) << AttributesIdx;
+    return false;
+  }
+  return true; // both checks passed; nothing was diagnosed
+}
+
 Sema::TemplateDeductionResult
 HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
     FunctionTemplateDecl *FunctionTemplate,
@@ -10878,6 +10894,7 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
     LPCSTR tableName = cursor.GetTableName();
     // Currently only intrinsic we allow for explicit template arguments are
     // for Load/Store for ByteAddressBuffer/RWByteAddressBuffer
+    // and HitObject::GetAttributes with user-defined intersection attributes.
 
     // Check Explicit template arguments
     UINT intrinsicOp = (*cursor)->Op;
@@ -10892,28 +10909,38 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
       IsBABLoad = intrinsicOp == (UINT)IntrinsicOp::MOP_Load;
       IsBABStore = intrinsicOp == (UINT)IntrinsicOp::MOP_Store;
     }
-    if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() > 0) {
-      bool isLegalTemplate = false;
+    bool IsHitObjectGetAttributes =
+        intrinsicOp == (UINT)IntrinsicOp::MOP_DxHitObject_GetAttributes;
+    if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() >= 1) {
       SourceLocation Loc = ExplicitTemplateArgs->getLAngleLoc();
-      auto TemplateDiag = diag::err_hlsl_intrinsic_template_arg_unsupported;
-      if (ExplicitTemplateArgs->size() >= 1 && (IsBABLoad || IsBABStore)) {
-        TemplateDiag = diag::err_hlsl_intrinsic_template_arg_requires_2018;
-        Loc = (*ExplicitTemplateArgs)[0].getLocation();
-        if (Is2018) {
-          TemplateDiag = diag::err_hlsl_intrinsic_template_arg_numeric;
-          if (ExplicitTemplateArgs->size() == 1 &&
-              !functionTemplateTypeArg.isNull() &&
-              hlsl::IsHLSLNumericOrAggregateOfNumericType(
-                  functionTemplateTypeArg)) {
-            isLegalTemplate = true;
-          }
-        }
+      if (!IsBABLoad && !IsBABStore && !IsHitObjectGetAttributes) {
+        getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_unsupported)
+            << intrinsicName;
+        return Sema::TemplateDeductionResult::TDK_Invalid;
       }
-
-      if (!isLegalTemplate) {
-        getSema()->Diag(Loc, TemplateDiag) << intrinsicName;
+      Loc = (*ExplicitTemplateArgs)[0].getLocation();
+      if (!Is2018) {
+        getSema()->Diag(Loc,
+                        diag::err_hlsl_intrinsic_template_arg_requires_2018)
+            << intrinsicName;
         return Sema::TemplateDeductionResult::TDK_Invalid;
       }
+
+      if (IsBABLoad || IsBABStore) {
+        const bool IsLegalTemplate =
+            !functionTemplateTypeArg.isNull() &&
+            hlsl::IsHLSLNumericOrAggregateOfNumericType(
+                functionTemplateTypeArg);
+        if (!IsLegalTemplate) {
+          getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_numeric)
+              << intrinsicName;
+          return Sema::TemplateDeductionResult::TDK_Invalid;
+        }
+      }
+      if (IsHitObjectGetAttributes &&
+          !DiagnoseIntersectionAttributes(*getSema(), Loc,
+                                          functionTemplateTypeArg))
+        return Sema::TemplateDeductionResult::TDK_Invalid;
     } else if (IsBABStore) {
       // Prior to HLSL 2018, Store operation only stored scalar uint.
       if (!Is2018) {
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
new file mode 100644
index 0000000000..03cefe8e48
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
@@ -0,0 +1,26 @@
+// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL
+
+// DXIL: %[[APTR:[^ ]+]] = alloca %struct.CustomAttrs, align 4
+// DXIL: %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+// DXIL: call void @dx.op.hitObject_Attributes.struct.CustomAttrs(i32 289, %dx.types.HitObject %[[NOP]], %struct.CustomAttrs* nonnull %[[APTR]])  ; HitObject_Attributes(hitObject,attributes)
+// DXIL: %[[VPTR:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[APTR]], i32 0, i32 0
+// DXIL: %{{[^ ]+}} = load <4 x float>, <4 x float>* %[[VPTR]], align 4
+// DXIL: %[[IPTR:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[APTR]], i32 0, i32 1
+// DXIL: %{{[^ ]+}} = load i32, i32* %[[IPTR]], align 4
+// DXIL: ret void
+
+RWByteAddressBuffer outbuf;
+
+struct
+CustomAttrs { // Attribute struct requested from the HitObject in main().
+  float4 v; // Field 0; the FileCheck expectations above load it as <4 x float>.
+  int y; // Field 1; the FileCheck expectations above load it as i32.
+};
+
+[shader("raygeneration")]
+void main() { // Entry point: reads hit attributes from a HitObject and stores one value derived from them.
+  dx::HitObject hit; // No initializer; the FileCheck expectations above look for a HitObject_MakeNop call.
+  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>(); // The attribute struct type is given as an explicit template argument.
+  float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y; // Reads all four lanes of v plus y, so both fields are loaded.
+  outbuf.Store(0, sum); // Writes the sum to the RWByteAddressBuffer at address 0.
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
new file mode 100644
index 0000000000..4887be4d58
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
@@ -0,0 +1,130 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+; REQUIRES: dxil-1-9
+
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; outbuf                                UAV    byte         r/w      U0u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RWByteAddressBuffer = type { i32 }
+%dx.types.HitObject = type { i8* }
+%struct.CustomAttrs = type { <4 x float>, i32 }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.dx::HitObject" = type { i32 }
+
+@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+
+; CHECK: %[[ATTRA:[^ ]+]] = alloca %struct.CustomAttrs, align 4
+; CHECK: call void @dx.op.hitObject_Attributes.struct.CustomAttrs(i32 289, %dx.types.HitObject %{{[^ ]+}}, %struct.CustomAttrs* %[[ATTRA]])
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 { ; HL-level body of main() from hitobject_attributes.hlsl; this is the input the dxilgen pass lowers.
+entry:
+  %hit = alloca %dx.types.HitObject, align 4 ; storage for the HitObject local
+  %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !21 ; line:22 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !21 ; line:22 col:3
+  %1 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !25 ; line:22 col:17 ; HL op 358 on %hit, presumably the default construction of the local (confirm against the opcode table)
+  %2 = call %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %hit), !dbg !26 ; line:23 col:23 ; HL op 364 returns the attribute struct pointer; the FileCheck expectations above want it replaced by dx.op.hitObject_Attributes writing into an alloca
+  %3 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %2, i32 0, i32 0, !dbg !26 ; line:23 col:23
+  %4 = load <4 x float>, <4 x float>* %3, !dbg !26 ; line:23 col:23
+  %5 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %2, i32 0, i32 1, !dbg !26 ; line:23 col:23
+  %6 = load i32, i32* %5, !dbg !26 ; line:23 col:23
+  %7 = extractelement <4 x float> %4, i32 0, !dbg !27 ; line:24 col:15
+  %8 = extractelement <4 x float> %4, i32 1, !dbg !28 ; line:24 col:27
+  %add = fadd float %7, %8, !dbg !29 ; line:24 col:25
+  %9 = extractelement <4 x float> %4, i32 2, !dbg !30 ; line:24 col:39
+  %add4 = fadd float %add, %9, !dbg !31 ; line:24 col:37
+  %10 = extractelement <4 x float> %4, i32 3, !dbg !32 ; line:24 col:51
+  %add6 = fadd float %add4, %10, !dbg !33 ; line:24 col:49
+  %conv = sitofp i32 %6 to float, !dbg !34 ; line:24 col:63
+  %add7 = fadd float %add6, %conv, !dbg !35 ; line:24 col:61
+  %11 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !36 ; line:25 col:3
+  %12 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %11), !dbg !36 ; line:25 col:3
+  %13 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %12, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !36 ; line:25 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %13, i32 0, float %add7), !dbg !36 ; line:25 col:3 ; passes the summed value and offset 0 to the annotated outbuf handle
+  %14 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !37 ; line:26 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %14) #0, !dbg !37 ; line:26 col:1
+  ret void, !dbg !37 ; line:26 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+
+; Function Attrs: nounwind
+declare %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!dx.version = !{!2}
+!dx.valver = !{!2}
+!dx.shaderModel = !{!3}
+!dx.typeAnnotations = !{!4, !10}
+!dx.entryPoints = !{!14}
+!dx.fnprops = !{!18}
+!dx.options = !{!19, !20}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{i32 1, i32 9}
+!3 = !{!"lib", i32 6, i32 9}
+!4 = !{i32 0, %"class.dx::HitObject" undef, !5, %struct.CustomAttrs undef, !7}
+!5 = !{i32 4, !6}
+!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!7 = !{i32 20, !8, !9}
+!8 = !{i32 6, !"v", i32 3, i32 0, i32 7, i32 9, i32 13, i32 4}
+!9 = !{i32 6, !"y", i32 3, i32 16, i32 7, i32 4}
+!10 = !{i32 1, void ()* @"\01?main@@YAXXZ", !11}
+!11 = !{!12}
+!12 = !{i32 1, !13, !13}
+!13 = !{}
+!14 = !{null, !"", null, !15, null}
+!15 = !{null, !16, null, null}
+!16 = !{!17}
+!17 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!18 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!19 = !{i32 -2147483584}
+!20 = !{i32 -1}
+!21 = !DILocation(line: 22, column: 3, scope: !22)
+!22 = !DISubprogram(name: "main", scope: !23, file: !23, line: 21, type: !24, isLocal: false, isDefinition: true, scopeLine: 21, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!23 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl", directory: "")
+!24 = !DISubroutineType(types: !13)
+!25 = !DILocation(line: 22, column: 17, scope: !22)
+!26 = !DILocation(line: 23, column: 23, scope: !22)
+!27 = !DILocation(line: 24, column: 15, scope: !22)
+!28 = !DILocation(line: 24, column: 27, scope: !22)
+!29 = !DILocation(line: 24, column: 25, scope: !22)
+!30 = !DILocation(line: 24, column: 39, scope: !22)
+!31 = !DILocation(line: 24, column: 37, scope: !22)
+!32 = !DILocation(line: 24, column: 51, scope: !22)
+!33 = !DILocation(line: 24, column: 49, scope: !22)
+!34 = !DILocation(line: 24, column: 63, scope: !22)
+!35 = !DILocation(line: 24, column: 61, scope: !22)
+!36 = !DILocation(line: 25, column: 3, scope: !22)
+!37 = !DILocation(line: 26, column: 1, scope: !22)
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
new file mode 100644
index 0000000000..79db78cdaf
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetAttributes
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetAttributes 'TResult () const'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetAttributes 'CustomAttrs &()' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'CustomAttrs'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 364
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// FCGL: %{{[^ ]+}} = call %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}})
+
+RWByteAddressBuffer outbuf;
+
+struct
+CustomAttrs { // Attribute struct used as the explicit template argument of GetAttributes in main().
+  float4 v;
+  int y;
+};
+
+[shader("raygeneration")]
+void main() { // Entry point whose GetAttributes call produces the template instantiation matched by the ast dump expectations above.
+  dx::HitObject hit;
+  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>(); // The fcgl expectation above matches this call as HL op 364 taking a HitObject pointer.
+  float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y; // Reads every field of the returned struct.
+  outbuf.Store(0, sum); // Writes the sum to the RWByteAddressBuffer at address 0.
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
new file mode 100644
index 0000000000..240ccfb9d4
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -T lib_6_9 -E main %s -verify
+
+struct
+CustomAttrs { // Deliberately invalid attribute struct for this negative test.
+  vector<float, 32> v; // 32 elements: over the 4-element limit named in the diagnostic that main() is verified to produce.
+  int y;
+};
+
+[shader("raygeneration")]
+void main() { // Negative test: requesting CustomAttrs (which holds a 32-element vector) must be rejected.
+  dx::HitObject hit;
+  // expected-error@+1{{vectors of over 4 elements in attributes are not supported}}
+  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>(); // The diagnostic is verified on this line.
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
new file mode 100644
index 0000000000..0f27f089e4
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -T lib_6_9 -E main %s -verify
+
+struct
+CustomAttrs { // Deliberately invalid attribute struct for this negative test.
+  vector<float, 32> v; // NOTE(review): also a 32-element vector, yet main() verifies only the numeric-types diagnostic; confirm this member is intentional.
+  RWStructuredBuffer<float> buf; // Resource member, i.e. not a numeric type; matches the diagnostic main() is verified to produce.
+};
+
+[shader("raygeneration")]
+void main() { // Negative test: requesting CustomAttrs (which holds a resource member) must be rejected.
+  dx::HitObject hit;
+  // expected-error@+1{{attributes type must be a user-defined type composed of only numeric types}}
+  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>(); // The diagnostic is verified on this line.
+}

From 624665f3987d379a299aa14dbc53e0cd3b96afea Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Thu, 24 Apr 2025 19:01:41 +0200
Subject: [PATCH 09/93] [SER] HitObject::FromRayQuery HLSL -> DXIL lowering
 (#7370)

* HLSL -> DXIL lowering
* ast, hlsl->dxil, dxilgen, and ScalarReplAggregatesHLSL tests

SER implementation tracker (#7214)
---
 include/dxc/HLSL/HLOperations.h               |   4 +
 lib/HLSL/HLOperationLower.cpp                 |  27 +-
 .../Scalar/ScalarReplAggregatesHLSL.cpp       |  25 ++
 tools/clang/lib/Sema/SemaHLSL.cpp             |   1 +
 .../HitObject/hitobject_fromrayquery.hlsl     |  37 ++
 .../DxilGen/hitobject_fromrayquery_dxilgen.ll | 146 +++++++
 .../hitobject_fromrayquery_scalarrepl.ll      | 383 ++++++++++++++++++
 .../HitObject/hitobject_fromrayquery.hlsl     |  72 ++++
 8 files changed, 694 insertions(+), 1 deletion(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll
 create mode 100644 tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl

diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h
index 970ddd3e85..0e9b8c2710 100644
--- a/include/dxc/HLSL/HLOperations.h
+++ b/include/dxc/HLSL/HLOperations.h
@@ -441,6 +441,10 @@ const unsigned kHitObjectMakeMissRayDescOpIdx = 4;
 const unsigned kHitObjectTraceRay_RayDescOpIdx = 8;
 const unsigned kHitObjectTraceRay_NumOp = 10;
 
+// HitObject::FromRayQuery
+const unsigned kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx = 4;
+const unsigned kHitObjectFromRayQuery_WithAttrs_NumOp = 5;
+
 } // namespace HLOperandIndex
 
 llvm::Function *GetOrCreateHLFunction(llvm::Module &M,
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index f8a9f528cc..4ef7591e89 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -6312,7 +6312,32 @@ Value *TranslateHitObjectFromRayQuery(CallInst *CI, IntrinsicOp IOP,
                                       HLOperationLowerHelper &Helper,
                                       HLObjectOperationLowerHelper *pObjHelper,
                                       bool &Translated) {
-  return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  unsigned SrcIdx = 1;
+  Value *HitObjectPtr = CI->getArgOperand(SrcIdx++);
+  Value *RayQuery = CI->getArgOperand(SrcIdx++);
+
+  if (CI->getNumArgOperands() ==
+      HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp) {
+    Value *HitKind = CI->getArgOperand(SrcIdx++);
+    Value *AttribSrc = CI->getArgOperand(SrcIdx++);
+    DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
+    OpCode = DXIL::OpCode::HitObject_FromRayQueryWithAttrs;
+    Type *AttrTy = AttribSrc->getType();
+    Value *OutHitObject = TrivialDxilOperation(
+        OpCode, {nullptr, RayQuery, HitKind, AttribSrc}, AttrTy, CI, OP);
+    Builder.CreateStore(OutHitObject, HitObjectPtr);
+    return nullptr;
+  }
+
+  DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
+  OpCode = DXIL::OpCode::HitObject_FromRayQuery;
+  Value *OutHitObject =
+      TrivialDxilOperation(OpCode, {nullptr, RayQuery}, Helper.voidTy, CI, OP);
+  Builder.CreateStore(OutHitObject, HitObjectPtr);
+  return nullptr;
 }
 
 Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP,
diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index b13e9a0f5d..20265af40a 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -2795,6 +2795,31 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
         }
       }
         LLVM_FALLTHROUGH;
+      case IntrinsicOp::MOP_DxHitObject_FromRayQuery: {
+        const bool IsWithAttrs =
+            CI->getNumArgOperands() ==
+            HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp;
+        if (IsWithAttrs &&
+            (OldVal ==
+             CI->getArgOperand(
+                 HLOperandIndex::
+                     kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx))) {
+          RewriteCallArg(
+              CI,
+              HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx,
+              /*bIn*/ true, /*bOut*/ false);
+          break;
+        }
+
+        // For RayQuery methods, we want to replace the RayQuery this pointer
+        // with a load and use of the underlying handle value.
+        // This will allow elimination of RayQuery types earlier.
+        RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
+                                            /*loadElts*/ true);
+        DeadInsts.push_back(CI);
+        break;
+      }
+        LLVM_FALLTHROUGH;
       default:
         // RayQuery this pointer replacement.
         if (OldVal->getType()->isPointerTy() &&
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 6eadfeaac9..5131d39f44 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -12093,6 +12093,7 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
   case hlsl::IntrinsicOp::MOP_TraceRayInline:
     DiagnoseTraceRayInline(*this, CE);
     break;
+  case hlsl::IntrinsicOp::MOP_DxHitObject_FromRayQuery:
   case hlsl::IntrinsicOp::MOP_DxHitObject_Invoke:
   case hlsl::IntrinsicOp::MOP_DxHitObject_MakeMiss:
   case hlsl::IntrinsicOp::MOP_DxHitObject_MakeNop:
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
new file mode 100644
index 0000000000..33ea2719be
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
@@ -0,0 +1,37 @@
+// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL
+
+// DXIL: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %[[RQ:[^ ]+]])  ; HitObject_FromRayQuery(rayQueryHandle)
+// DXIL: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %[[RQ]], i32 16, %struct.CustomAttrs* nonnull %{{[^ ]+}})  ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs)
+
+RaytracingAccelerationStructure RTAS;
+RWStructuredBuffer<float> UAV : register(u0);
+
+RayDesc MakeRayDesc() { // Returns a fixed RayDesc used for the inline trace in main().
+  RayDesc desc;
+  desc.Origin = float3(0, 0, 0);
+  desc.Direction = float3(1, 0, 0); // Along +X.
+  desc.TMin = 0.0f;
+  desc.TMax = 9999.0;
+  return desc;
+}
+
+struct CustomAttrs { // Two-float attribute struct passed to the three-argument FromRayQuery call in main().
+  float x;
+  float y;
+};
+
+void Use(in dx::HitObject hit) { // Consumes a HitObject; presumably keeps the FromRayQuery results in main() from being unused.
+  dx::MaybeReorderThread(hit);
+}
+
+[shader("raygeneration")]
+void main() { // Entry point: builds HitObjects from one RayQuery using both FromRayQuery forms.
+  RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH> q;
+  RayDesc ray = MakeRayDesc();
+  q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray);
+
+  Use(dx::HitObject::FromRayQuery(q)); // One-argument form; the expectations above match hitObject_FromRayQuery (opcode 263).
+
+  CustomAttrs attrs = {1.f, 2.f};
+  Use(dx::HitObject::FromRayQuery(q, 16, attrs)); // Hit kind 16 plus attributes; the expectations above match hitObject_FromRayQueryWithAttrs (opcode 264).
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll
new file mode 100644
index 0000000000..0ae8e36fa7
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll
@@ -0,0 +1,146 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+; REQUIRES: dxil-1-9
+
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; RTAS                              texture     i32         ras      T0t4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%struct.CustomAttrs = type { float, float }
+%dx.types.HitObject = type { i8* }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.RWStructuredBuffer<float>" = type { float }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%"class.dx::HitObject" = type { i32 }
+%"class.RayQuery<5, 0>" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+
+; CHECK: %[[ATTRA:[^ ]+]] = alloca %struct.CustomAttrs
+; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQH:[^ ]+]], %dx.types.Handle %{{[^ ]+}}, i32 0, i32 255, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)
+; CHECK: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %[[RQH]])
+; CHECK: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %[[RQH]], i32 16, %struct.CustomAttrs* %[[ATTRA]])
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 { ; HL-level body of main() from hitobject_fromrayquery.hlsl; this is the input the dxilgen pass lowers.
+entry:
+  %0 = alloca %struct.CustomAttrs ; attribute storage handed to the five-operand HL op 363 call below
+  %agg.tmp = alloca %dx.types.HitObject, align 4
+  %agg.tmp1 = alloca %dx.types.HitObject, align 4
+  %q2 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 5, i32 0), !dbg !38 ; line:29 col:78
+  %1 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !42 ; line:31 col:3
+  %2 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %1), !dbg !42 ; line:31 col:3
+  %3 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !42 ; line:31 col:3
+  call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %q2, %dx.types.Handle %3, i32 0, i32 255, <3 x float> zeroinitializer, float 0.000000e+00, <3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float 9.999000e+03), !dbg !42 ; line:31 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 363, %dx.types.HitObject* %agg.tmp, i32 %q2), !dbg !43 ; line:33 col:7 ; HL op 363, form without attributes: (hit object pointer, ray query handle)
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp) #0, !dbg !44 ; line:24 col:3
+  %.0 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %0, i32 0, i32 0
+  store float 1.000000e+00, float* %.0
+  %.1 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %0, i32 0, i32 1
+  store float 2.000000e+00, float* %.1, align 4
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp1, i32 %q2, i32 16, %struct.CustomAttrs* %0), !dbg !47 ; line:36 col:7 ; HL op 363, form with attributes: adds hit kind 16 and the attribute pointer
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp1) #0, !dbg !48 ; line:24 col:3
+  ret void, !dbg !50 ; line:37 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!dx.version = !{!2}
+!dx.valver = !{!2}
+!dx.shaderModel = !{!3}
+!dx.typeAnnotations = !{!4, !26}
+!dx.entryPoints = !{!30}
+!dx.fnprops = !{!35}
+!dx.options = !{!36, !37}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{i32 1, i32 9}
+!3 = !{!"lib", i32 6, i32 9}
+!4 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !5, %struct.RayDesc undef, !10, %"class.dx::HitObject" undef, !15, %"class.RayQuery<5, 0>" undef, !17, %struct.CustomAttrs undef, !23}
+!5 = !{i32 4, !6, !7}
+!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
+!7 = !{i32 0, !8}
+!8 = !{!9}
+!9 = !{i32 0, float undef}
+!10 = !{i32 32, !11, !12, !13, !14}
+!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
+!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!15 = !{i32 4, !16}
+!16 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!17 = !{i32 4, !18, !19}
+!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!19 = !{i32 0, !20}
+!20 = !{!21, !22}
+!21 = !{i32 1, i64 5}
+!22 = !{i32 1, i64 0}
+!23 = !{i32 8, !24, !25}
+!24 = !{i32 6, !"x", i32 3, i32 0, i32 7, i32 9}
+!25 = !{i32 6, !"y", i32 3, i32 4, i32 7, i32 9}
+!26 = !{i32 1, void ()* @"\01?main@@YAXXZ", !27}
+!27 = !{!28}
+!28 = !{i32 1, !29, !29}
+!29 = !{}
+!30 = !{null, !"", null, !31, null}
+!31 = !{!32, null, null, null}
+!32 = !{!33}
+!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34}
+!34 = !{i32 0, i32 4}
+!35 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!36 = !{i32 -2147483584}
+!37 = !{i32 -1}
+!38 = !DILocation(line: 29, column: 78, scope: !39)
+!39 = !DISubprogram(name: "main", scope: !40, file: !40, line: 28, type: !41, isLocal: false, isDefinition: true, scopeLine: 28, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!40 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl", directory: "")
+!41 = !DISubroutineType(types: !29)
+!42 = !DILocation(line: 31, column: 3, scope: !39)
+!43 = !DILocation(line: 33, column: 7, scope: !39)
+!44 = !DILocation(line: 24, column: 3, scope: !45, inlinedAt: !46)
+!45 = !DISubprogram(name: "Use", scope: !40, file: !40, line: 23, type: !41, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false)
+!46 = distinct !DILocation(line: 33, column: 3, scope: !39)
+!47 = !DILocation(line: 36, column: 7, scope: !39)
+!48 = !DILocation(line: 24, column: 3, scope: !45, inlinedAt: !49)
+!49 = distinct !DILocation(line: 36, column: 3, scope: !39)
+!50 = !DILocation(line: 37, column: 1, scope: !39)
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
new file mode 100644
index 0000000000..5afd30b524
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
@@ -0,0 +1,383 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+
+; COM: Original HLSL code
+; COM: RaytracingAccelerationStructure RTAS;
+; COM: RWStructuredBuffer<float> UAV : register(u0);
+; COM: RWByteAddressBuffer inbuf;
+; COM: RWByteAddressBuffer outbuf;
+; COM: 
+; COM: RayDesc MakeRayDesc() {
+; COM:   RayDesc desc;
+; COM:   desc.Origin = float3(0, 0, 0);
+; COM:   desc.Direction = float3(1, 0, 0);
+; COM:   desc.TMin = 0.0f;
+; COM:   desc.TMax = 9999.0;
+; COM:   return desc;
+; COM: }
+; COM: 
+; COM: struct CustomAttrs {
+; COM:   float x;
+; COM:   float y;
+; COM: };
+; COM: 
+; COM: void Use(in dx::HitObject hit) {
+; COM:   dx::MaybeReorderThread(hit);
+; COM: }
+; COM: 
+; COM: [shader("raygeneration")]
+; COM: void main() {
+; COM:   RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH> q;
+; COM:   RayDesc ray = MakeRayDesc();
+; COM:   q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray);
+; COM: 
+; COM:   Use(dx::HitObject::FromRayQuery(q));
+; COM: 
+; COM:   CustomAttrs attrs;
+; COM:   attrs.x = inbuf.Load(0);
+; COM:   attrs.y = inbuf.Load(4);
+; COM:   Use(dx::HitObject::FromRayQuery(q, 16, attrs));
+; COM: 
+; COM:   attrs.x = inbuf.Load(8);
+; COM:   attrs.y = inbuf.Load(12);
+; COM:   Use(dx::HitObject::FromRayQuery(q, 17, attrs));
+; COM: 
+; COM:   outbuf.Store(0, attrs.x);
+; COM:   outbuf.Store(4, attrs.y);
+; COM: }
+
+;
+; Buffer Definitions:
+;
+; cbuffer $Globals
+; {
+;
+;   [0 x i8] (type annotation not present)
+;
+; }
+;
+; Resource bind info for UAV
+; {
+;
+;   float $Element;                                   ; Offset:    0 Size:     4
+;
+; }
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; $Globals                          cbuffer      NA          NA     CB0   cb4294967295     1
+; RTAS                              texture     i32         ras      T0t4294967295,space4294967295     1
+; UAV                                   UAV  struct         r/w      U0             u0     1
+; inbuf                                 UAV    byte         r/w      U1u4294967295,space4294967295     1
+; outbuf                                UAV    byte         r/w      U2u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%"class.RWStructuredBuffer<float>" = type { float }
+%struct.RWByteAddressBuffer = type { i32 }
+%ConstantBuffer = type opaque
+%"class.RayQuery<5, 0>" = type { i32 }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%dx.types.HitObject = type { i8* }
+%struct.CustomAttrs = type { float, float }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.dx::HitObject" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"\01?UAV@@3V?$RWStructuredBuffer@M@@A" = external global %"class.RWStructuredBuffer<float>", align 4
+@"\01?inbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+@"$Globals" = external constant %ConstantBuffer
+
+; CHECK: %[[RQA:[^ ]+]] = alloca i32
+; CHECK: %[[ATTRA0:[^ ]+]] = alloca %struct.CustomAttrs
+; CHECK: %[[ATTRA1:[^ ]+]] = alloca %struct.CustomAttrs
+; CHECK: %[[XATTRA:[^ ]+]] = alloca float
+; CHECK: %[[YATTRA:[^ ]+]] = alloca float
+
+; COM: Check that TraceRayInline and the FromRayQuery calls each load the query handle from the same alloca
+; CHECK: %[[RQH:[^ ]+]] = load i32, i32* %[[RQA]]
+; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQH]],
+
+; COM: Check RQ handle loaded for first FromRayQuery call
+; CHECK: %[[RQH0:[^ ]+]] = load i32, i32* %[[RQA]]
+; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH0]])
+
+; COM: Check buffer loads for first FromRayQuery-with-attrs call
+; CHECK: %[[XI0:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 0)
+; CHECK: %[[XF0:[^ ]+]] = uitofp i32 %[[XI0]] to float
+; CHECK: store float %[[XF0]], float* %[[XATTRA]], align 4
+; CHECK: %[[YI0:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 4)
+; CHECK: %[[YF0:[^ ]+]] = uitofp i32 %[[YI0]] to float
+; CHECK: store float %[[YF0]], float* %[[YATTRA]], align 4
+
+; COM: Check that values from buffer flow into first FromRayQuery-with-attrs call
+; CHECK: %[[XPTR0:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA0]], i32 0, i32 0
+; CHECK: %[[XF1:[^ ]+]] = load float, float* %[[XATTRA]]
+; CHECK: store float %[[XF1]], float* %[[XPTR0]]
+; CHECK: %[[YPTR0:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA0]], i32 0, i32 1
+; CHECK: %[[YF1:[^ ]+]] = load float, float* %[[YATTRA]]
+; CHECK: store float %[[YF1]], float* %[[YPTR0]], align 4
+; CHECK: %[[RQH1:[^ ]+]] = load i32, i32* %[[RQA]]
+; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH1]], i32 16, %struct.CustomAttrs* %[[ATTRA0]])
+
+; COM: Check buffer loads for second FromRayQuery-with-attrs call
+; CHECK: %[[XI1:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 8)
+; CHECK: %[[XF1:[^ ]+]] = uitofp i32 %[[XI1]] to float
+; CHECK: store float %[[XF1]], float* %[[XATTRA]], align 4
+; CHECK: %[[YI1:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 12)
+; CHECK: %[[YF1:[^ ]+]] = uitofp i32 %[[YI1]] to float
+; CHECK: store float %[[YF1]], float* %[[YATTRA]], align 4
+
+; COM: Check that values from buffer flow into second FromRayQuery-with-attrs call
+; CHECK: %[[XPTR1:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA1]], i32 0, i32 0
+; CHECK: %[[XF2:[^ ]+]] = load float, float* %[[XATTRA]]
+; CHECK: store float %[[XF2]], float* %[[XPTR1]]
+; CHECK: %[[YPTR1:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA1]], i32 0, i32 1
+; CHECK: %[[YF2:[^ ]+]] = load float, float* %[[YATTRA]]
+; CHECK: store float %[[YF2]], float* %[[YPTR1]], align 4
+; CHECK: %[[RQH2:[^ ]+]] = load i32, i32* %[[RQA]]
+; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH2]], i32 17, %struct.CustomAttrs* %[[ATTRA1]])
+
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+entry:
+  %q = alloca %"class.RayQuery<5, 0>", align 4
+  %ray = alloca %struct.RayDesc, align 4
+  %agg.tmp = alloca %dx.types.HitObject, align 4
+  %attrs = alloca %struct.CustomAttrs, align 4
+  %agg.tmp4 = alloca %dx.types.HitObject, align 4
+  %agg.tmp11 = alloca %dx.types.HitObject, align 4
+  %0 = bitcast %"class.RayQuery<5, 0>"* %q to i8*, !dbg !45 ; line:26 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !45 ; line:26 col:3
+  %q14 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 5, i32 0), !dbg !49 ; line:26 col:78
+  %1 = getelementptr inbounds %"class.RayQuery<5, 0>", %"class.RayQuery<5, 0>"* %q, i32 0, i32 0, !dbg !49 ; line:26 col:78
+  store i32 %q14, i32* %1, !dbg !49 ; line:26 col:78
+  %2 = bitcast %struct.RayDesc* %ray to i8*, !dbg !50 ; line:27 col:3
+  call void @llvm.lifetime.start(i64 32, i8* %2) #0, !dbg !50 ; line:27 col:3
+  %Origin.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 0, !dbg !51 ; line:8 col:8
+  store <3 x float> zeroinitializer, <3 x float>* %Origin.i, align 4, !dbg !54, !tbaa !55, !alias.scope !58 ; line:8 col:15
+  %Direction.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 2, !dbg !61 ; line:9 col:8
+  store <3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, <3 x float>* %Direction.i, align 4, !dbg !62, !tbaa !55, !alias.scope !58 ; line:9 col:18
+  %TMin.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 1, !dbg !63 ; line:10 col:8
+  store float 0.000000e+00, float* %TMin.i, align 4, !dbg !64, !tbaa !65, !alias.scope !58 ; line:10 col:13
+  %TMax.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 3, !dbg !67 ; line:11 col:8
+  store float 9.999000e+03, float* %TMax.i, align 4, !dbg !68, !tbaa !65, !alias.scope !58 ; line:11 col:13
+  %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !69 ; line:28 col:3
+  %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !69 ; line:28 col:3
+  %5 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !69 ; line:28 col:3
+  call void @"dx.hl.op..void (i32, %\22class.RayQuery<5, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<5, 0>"* %q, %dx.types.Handle %5, i32 0, i32 255, %struct.RayDesc* %ray), !dbg !69 ; line:28 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32 363, %dx.types.HitObject* %agg.tmp, %"class.RayQuery<5, 0>"* %q), !dbg !70 ; line:30 col:7
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp) #0, !dbg !71 ; line:21 col:3
+  %6 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !74 ; line:32 col:3
+  call void @llvm.lifetime.start(i64 8, i8* %6) #0, !dbg !74 ; line:32 col:3
+  %7 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !75 ; line:33 col:13
+  %8 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %7), !dbg !75 ; line:33 col:13
+  %9 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %8, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !75 ; line:33 col:13
+  %10 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %9, i32 0), !dbg !75 ; line:33 col:13
+  %conv = uitofp i32 %10 to float, !dbg !75 ; line:33 col:13
+  %x = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !76 ; line:33 col:9
+  store float %conv, float* %x, align 4, !dbg !77, !tbaa !65 ; line:33 col:11
+  %11 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !78 ; line:34 col:13
+  %12 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %11), !dbg !78 ; line:34 col:13
+  %13 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %12, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !78 ; line:34 col:13
+  %14 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %13, i32 4), !dbg !78 ; line:34 col:13
+  %conv3 = uitofp i32 %14 to float, !dbg !78 ; line:34 col:13
+  %y = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !79 ; line:34 col:9
+  store float %conv3, float* %y, align 4, !dbg !80, !tbaa !65 ; line:34 col:11
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp4, %"class.RayQuery<5, 0>"* %q, i32 16, %struct.CustomAttrs* %attrs), !dbg !81 ; line:35 col:7
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp4) #0, !dbg !82 ; line:21 col:3
+  %15 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !84 ; line:37 col:13
+  %16 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %15), !dbg !84 ; line:37 col:13
+  %17 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %16, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !84 ; line:37 col:13
+  %18 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %17, i32 8), !dbg !84 ; line:37 col:13
+  %conv6 = uitofp i32 %18 to float, !dbg !84 ; line:37 col:13
+  %x7 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !85 ; line:37 col:9
+  store float %conv6, float* %x7, align 4, !dbg !86, !tbaa !65 ; line:37 col:11
+  %19 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !87 ; line:38 col:13
+  %20 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %19), !dbg !87 ; line:38 col:13
+  %21 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %20, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !87 ; line:38 col:13
+  %22 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %21, i32 12), !dbg !87 ; line:38 col:13
+  %conv9 = uitofp i32 %22 to float, !dbg !87 ; line:38 col:13
+  %y10 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !88 ; line:38 col:9
+  store float %conv9, float* %y10, align 4, !dbg !89, !tbaa !65 ; line:38 col:11
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp11, %"class.RayQuery<5, 0>"* %q, i32 17, %struct.CustomAttrs* %attrs), !dbg !90 ; line:39 col:7
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp11) #0, !dbg !91 ; line:21 col:3
+  %x12 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !93 ; line:41 col:25
+  %23 = load float, float* %x12, align 4, !dbg !93, !tbaa !65 ; line:41 col:25
+  %24 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !94 ; line:41 col:3
+  %25 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %24), !dbg !94 ; line:41 col:3
+  %26 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %25, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !94 ; line:41 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %26, i32 0, float %23), !dbg !94 ; line:41 col:3
+  %y13 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !95 ; line:42 col:25
+  %27 = load float, float* %y13, align 4, !dbg !95, !tbaa !65 ; line:42 col:25
+  %28 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !96 ; line:42 col:3
+  %29 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %28), !dbg !96 ; line:42 col:3
+  %30 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %29, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !96 ; line:42 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %30, i32 4, float %27), !dbg !96 ; line:42 col:3
+  %31 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !97 ; line:43 col:1
+  call void @llvm.lifetime.end(i64 8, i8* %31) #0, !dbg !97 ; line:43 col:1
+  %32 = bitcast %struct.RayDesc* %ray to i8*, !dbg !97 ; line:43 col:1
+  call void @llvm.lifetime.end(i64 32, i8* %32) #0, !dbg !97 ; line:43 col:1
+  %33 = bitcast %"class.RayQuery<5, 0>"* %q to i8*, !dbg !97 ; line:43 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %33) #0, !dbg !97 ; line:43 col:1
+  ret void, !dbg !97 ; line:43 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %\22class.RayQuery<5, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<5, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32, %dx.types.HitObject*, %"class.RayQuery<5, 0>"*) #0
+
+; Function Attrs: nounwind readonly
+declare i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, %"class.RayQuery<5, 0>"*, i32, %struct.CustomAttrs*) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!dx.version = !{!2}
+!dx.valver = !{!2}
+!dx.shaderModel = !{!3}
+!dx.typeAnnotations = !{!4, !26}
+!dx.entryPoints = !{!30}
+!dx.fnprops = !{!42}
+!dx.options = !{!43, !44}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{i32 1, i32 9}
+!3 = !{!"lib", i32 6, i32 9}
+!4 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !5, %struct.RayDesc undef, !10, %"class.dx::HitObject" undef, !15, %"class.RayQuery<5, 0>" undef, !17, %struct.CustomAttrs undef, !23}
+!5 = !{i32 4, !6, !7}
+!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
+!7 = !{i32 0, !8}
+!8 = !{!9}
+!9 = !{i32 0, float undef}
+!10 = !{i32 32, !11, !12, !13, !14}
+!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
+!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!15 = !{i32 4, !16}
+!16 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!17 = !{i32 4, !18, !19}
+!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!19 = !{i32 0, !20}
+!20 = !{!21, !22}
+!21 = !{i32 1, i64 5}
+!22 = !{i32 1, i64 0}
+!23 = !{i32 8, !24, !25}
+!24 = !{i32 6, !"x", i32 3, i32 0, i32 7, i32 9}
+!25 = !{i32 6, !"y", i32 3, i32 4, i32 7, i32 9}
+!26 = !{i32 1, void ()* @"\01?main@@YAXXZ", !27}
+!27 = !{!28}
+!28 = !{i32 1, !29, !29}
+!29 = !{}
+!30 = !{null, !"", null, !31, null}
+!31 = !{!32, !35, !40, null}
+!32 = !{!33}
+!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34}
+!34 = !{i32 0, i32 4}
+!35 = !{!36, !38, !39}
+!36 = !{i32 0, %"class.RWStructuredBuffer<float>"* @"\01?UAV@@3V?$RWStructuredBuffer@M@@A", !"UAV", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false, i1 false, !37}
+!37 = !{i32 1, i32 4}
+!38 = !{i32 1, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !"inbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!39 = !{i32 2, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!40 = !{!41}
+!41 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
+!42 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!43 = !{i32 -2147483584}
+!44 = !{i32 -1}
+!45 = !DILocation(line: 26, column: 3, scope: !46)
+!46 = !DISubprogram(name: "main", scope: !47, file: !47, line: 25, type: !48, isLocal: false, isDefinition: true, scopeLine: 25, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!47 = !DIFile(filename: "hitobject_fromrayquery_scalarrepl.hlsl", directory: "")
+!48 = !DISubroutineType(types: !29)
+!49 = !DILocation(line: 26, column: 78, scope: !46)
+!50 = !DILocation(line: 27, column: 3, scope: !46)
+!51 = !DILocation(line: 8, column: 8, scope: !52, inlinedAt: !53)
+!52 = !DISubprogram(name: "MakeRayDesc", scope: !47, file: !47, line: 6, type: !48, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false)
+!53 = distinct !DILocation(line: 27, column: 17, scope: !46)
+!54 = !DILocation(line: 8, column: 15, scope: !52, inlinedAt: !53)
+!55 = !{!56, !56, i64 0}
+!56 = !{!"omnipotent char", !57, i64 0}
+!57 = !{!"Simple C/C++ TBAA"}
+!58 = !{!59}
+!59 = distinct !{!59, !60, !"\01?MakeRayDesc@@YA?AURayDesc@@XZ: %agg.result"}
+!60 = distinct !{!60, !"\01?MakeRayDesc@@YA?AURayDesc@@XZ"}
+!61 = !DILocation(line: 9, column: 8, scope: !52, inlinedAt: !53)
+!62 = !DILocation(line: 9, column: 18, scope: !52, inlinedAt: !53)
+!63 = !DILocation(line: 10, column: 8, scope: !52, inlinedAt: !53)
+!64 = !DILocation(line: 10, column: 13, scope: !52, inlinedAt: !53)
+!65 = !{!66, !66, i64 0}
+!66 = !{!"float", !56, i64 0}
+!67 = !DILocation(line: 11, column: 8, scope: !52, inlinedAt: !53)
+!68 = !DILocation(line: 11, column: 13, scope: !52, inlinedAt: !53)
+!69 = !DILocation(line: 28, column: 3, scope: !46)
+!70 = !DILocation(line: 30, column: 7, scope: !46)
+!71 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !73)
+!72 = !DISubprogram(name: "Use", scope: !47, file: !47, line: 20, type: !48, isLocal: false, isDefinition: true, scopeLine: 20, flags: DIFlagPrototyped, isOptimized: false)
+!73 = distinct !DILocation(line: 30, column: 3, scope: !46)
+!74 = !DILocation(line: 32, column: 3, scope: !46)
+!75 = !DILocation(line: 33, column: 13, scope: !46)
+!76 = !DILocation(line: 33, column: 9, scope: !46)
+!77 = !DILocation(line: 33, column: 11, scope: !46)
+!78 = !DILocation(line: 34, column: 13, scope: !46)
+!79 = !DILocation(line: 34, column: 9, scope: !46)
+!80 = !DILocation(line: 34, column: 11, scope: !46)
+!81 = !DILocation(line: 35, column: 7, scope: !46)
+!82 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !83)
+!83 = distinct !DILocation(line: 35, column: 3, scope: !46)
+!84 = !DILocation(line: 37, column: 13, scope: !46)
+!85 = !DILocation(line: 37, column: 9, scope: !46)
+!86 = !DILocation(line: 37, column: 11, scope: !46)
+!87 = !DILocation(line: 38, column: 13, scope: !46)
+!88 = !DILocation(line: 38, column: 9, scope: !46)
+!89 = !DILocation(line: 38, column: 11, scope: !46)
+!90 = !DILocation(line: 39, column: 7, scope: !46)
+!91 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !92)
+!92 = distinct !DILocation(line: 39, column: 3, scope: !46)
+!93 = !DILocation(line: 41, column: 25, scope: !46)
+!94 = !DILocation(line: 41, column: 3, scope: !46)
+!95 = !DILocation(line: 42, column: 25, scope: !46)
+!96 = !DILocation(line: 42, column: 3, scope: !46)
+!97 = !DILocation(line: 43, column: 1, scope: !46)
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
new file mode 100644
index 0000000000..004d25156a
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
@@ -0,0 +1,72 @@
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class Trq
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit FromRayQuery 'TResult (Trq) const' static
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'Trq'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used FromRayQuery 'dx::HitObject (RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>)' static
+// AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject'
+// AST-NEXT: | | |   |-TemplateArgument type 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 363
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class Trq
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class THitKind
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TAttributes
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit FromRayQuery 'TResult (Trq, THitKind, TAttributes) const' static
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'Trq'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> HitKind 'THitKind'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Attributes 'TAttributes'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used FromRayQuery 'dx::HitObject (RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>, unsigned int, CustomAttrs)' static
+// AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject'
+// AST-NEXT: | | |   |-TemplateArgument type 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | |   |-TemplateArgument type 'CustomAttrs'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> HitKind 'CustomAttrs'
+// AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 363
+// AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
+
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32 363, %dx.types.HitObject* %[[HITPTR0:[^ ]+]], %"class.RayQuery<5, 0>"* %[[RQ:[^ ]+]])
+// FCGL-NEXT: call void @"\01?Use@@YAXVHitObject@dx@@@Z"(%dx.types.HitObject* %[[HITPTR0]])
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %[[HITPTR1:[^ ]+]], %"class.RayQuery<5, 0>"* %[[RQ]], i32 16, %struct.CustomAttrs* %{{[^ ]+}})
+// FCGL-NEXT: call void @"\01?Use@@YAXVHitObject@dx@@@Z"(%dx.types.HitObject* %[[HITPTR1]])
+
+RaytracingAccelerationStructure RTAS;
+RWStructuredBuffer<float> UAV : register(u0);
+
+RayDesc MakeRayDesc() { // Returns the fixed RayDesc that main passes to TraceRayInline.
+  RayDesc desc;
+  desc.Origin = float3(0, 0, 0);
+  desc.Direction = float3(1, 0, 0); // Unit vector along +X.
+  desc.TMin = 0.0f;
+  desc.TMax = 9999.0;
+  return desc; // All four RayDesc fields are assigned above.
+}
+
+struct CustomAttrs { // User-defined attribute struct passed as the third FromRayQuery argument in main.
+  float x;
+  float y;
+};
+
+void Use(in dx::HitObject hit) { // Consumes a HitObject; the -fcgl run expects a call to this function right after each FromRayQuery op.
+  dx::MaybeReorderThread(hit);
+}
+
+[shader("raygeneration")] // Entry point that exercises both FromRayQuery overloads.
+void main() {
+  RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH> q; // The ast-dump run expects this type to print as 'RayQuery<5, 0>'.
+  RayDesc ray = MakeRayDesc();
+  q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray);
+
+  Use(dx::HitObject::FromRayQuery(q)); // Overload taking only the ray query.
+
+  CustomAttrs attrs = {1.f, 2.f};
+  Use(dx::HitObject::FromRayQuery(q, 16, attrs)); // Overload taking ray query, hit kind (16) and attributes.
+}

From 0f7af1be3fcd5ca99877b084afe7801c4e8597fd Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 25 Apr 2025 20:59:25 +0200
Subject: [PATCH 10/93] [SER] Validate HitObject accessors (#7371)

Validate:
 HitObject_GeometryIndex
 HitObject_HitKind
 HitObject_InstanceID
 HitObject_InstanceIndex
 HitObject_IsHit
 HitObject_IsMiss
 HitObject_IsNop
 HitObject_LoadLocalRootTableConstant
 HitObject_ObjectRayDirection
 HitObject_ObjectRayOrigin
 HitObject_ObjectToWorld3x4
 HitObject_PrimitiveIndex
 HitObject_RayFlags
 HitObject_RayTCurrent
 HitObject_RayTMin
 HitObject_SetShaderTableIndex
 HitObject_ShaderTableIndex
 HitObject_WorldRayDirection
 HitObject_WorldRayOrigin

Rules:
* No undef parameters (HitObject, RecordIndex, RootTableOffset)
* row/col/component indices are constant and in-bounds
* If constant, RootTableOffset % 4 == 0

SER implementation tracker:
https://github.com/microsoft/DirectXShaderCompiler/issues/7214
---
 docs/DXIL.rst                                 |   1 +
 lib/DxilValidation/DxilValidation.cpp         | 110 ++++++++++
 .../HitObject/hitobject_accessors.hlsl        |   4 +-
 .../ser_hitobject_accessors_failing.ll        | 202 ++++++++++++++++++
 .../ser_hitobject_accessors_passing.ll        |   2 +-
 .../HitObject/hitobject_accessors.hlsl        |   4 +-
 utils/hct/hctdb.py                            |   5 +
 7 files changed, 323 insertions(+), 5 deletions(-)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll

diff --git a/docs/DXIL.rst b/docs/DXIL.rst
index a1c5055085..a55f476450 100644
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@@ -3161,6 +3161,7 @@ INSTR.OPCODERESERVED                                  Instructions must not refe
 INSTR.OPCONST                                         DXIL intrinsic requires an immediate constant operand
 INSTR.OPCONSTRANGE                                    Constant values must be in-range for operation.
 INSTR.OPERANDRANGE                                    DXIL intrinsic operand must be within defined range
+INSTR.PARAMMULTIPLE                                   Parameter must be a valid multiple
 INSTR.PTRBITCAST                                      Pointer type bitcast must be have same size.
 INSTR.RESOURCECLASSFORLOAD                            load can only run on UAV/SRV resource.
 INSTR.RESOURCECLASSFORSAMPLERGATHER                   sample, lod and gather should be on srv resource.
diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 00a6b9ae14..69eb2a88f2 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -1644,6 +1644,46 @@ static unsigned getSemanticFlagValidMask(const ShaderModel *pSM) {
   return static_cast<unsigned>(hlsl::DXIL::BarrierSemanticFlag::ValidMask);
 }
 
+StringRef GetOpCodeName(DXIL::OpCode OpCode) { // Opcode name for diagnostics.
+  switch (OpCode) { // Covers only the HitObject vector and matrix accessors.
+  case DXIL::OpCode::HitObject_WorldRayOrigin:
+    return "HitObject_WorldRayOrigin";
+  case DXIL::OpCode::HitObject_WorldRayDirection:
+    return "HitObject_WorldRayDirection";
+  case DXIL::OpCode::HitObject_ObjectRayOrigin:
+    return "HitObject_ObjectRayOrigin";
+  case DXIL::OpCode::HitObject_ObjectRayDirection:
+    return "HitObject_ObjectRayDirection";
+  case DXIL::OpCode::HitObject_ObjectToWorld3x4:
+    return "HitObject_ObjectToWorld3x4";
+  case DXIL::OpCode::HitObject_WorldToObject3x4:
+    return "HitObject_WorldToObject3x4";
+  default: // Any other opcode trips the assert and yields an empty name.
+    DXASSERT(false, "Unexpected op code");
+    return "";
+  }
+}
+
+// Checks that Val is a ConstantInt whose unsigned value lies in the inclusive
+// range [LowerBound, UpperBound]; emits InstrOpConst if it is not constant and
+// InstrOperandRange if it is out of range. The upper bound is tested directly
+// because UpperBound + 1 would wrap to 0 when UpperBound is UINT64_MAX.
+static void ValidateConstantRangeUnsigned(Value *Val, StringRef Name,
+                                          uint64_t LowerBound,
+                                          uint64_t UpperBound, CallInst *CI,
+                                          DXIL::OpCode OpCode,
+                                          ValidationContext &ValCtx) {
+  ConstantInt *C = dyn_cast<ConstantInt>(Val);
+  if (!C)
+    ValCtx.EmitInstrFormatError(CI, ValidationRule::InstrOpConst,
+                                {Name, GetOpCodeName(OpCode)});
+  else if (C->getLimitedValue() > UpperBound || !C->uge(LowerBound))
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrOperandRange,
+        {Name, std::to_string(LowerBound) + "~" + std::to_string(UpperBound),
+         C->getValue().toString(10, false)});
+}
+
 static void ValidateDxilOperationCallInProfile(CallInst *CI,
                                                DXIL::OpCode Opcode,
                                                const ShaderModel *pSM,
@@ -1910,6 +1950,76 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
           CI, ValidationRule::InstrMayReorderThreadUndefCoherenceHintParam);
   } break;
 
+  case DXIL::OpCode::HitObject_LoadLocalRootTableConstant: {
+    Value *HitObject = CI->getArgOperand(1);
+    if (isa<UndefValue>(HitObject))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    Value *Offset = CI->getArgOperand(2);
+    if (isa<UndefValue>(Offset))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+    if (ConstantInt *COffset = dyn_cast<ConstantInt>(Offset)) {
+      if (COffset->getLimitedValue() % 4 != 0)
+        ValCtx.EmitInstrFormatError(
+            CI, ValidationRule::InstrParamMultiple,
+            {"offset", "4", COffset->getValue().toString(10, false)});
+    }
+    break;
+  }
+  case DXIL::OpCode::HitObject_SetShaderTableIndex: {
+    Value *HitObject = CI->getArgOperand(1);
+    if (isa<UndefValue>(HitObject))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    Value *RecordIndex = CI->getArgOperand(2);
+    if (isa<UndefValue>(RecordIndex))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+    break;
+  }
+
+  // Shader Execution Reordering - scalar getters
+  case DXIL::OpCode::HitObject_GeometryIndex:
+  case DXIL::OpCode::HitObject_HitKind:
+  case DXIL::OpCode::HitObject_InstanceID:
+  case DXIL::OpCode::HitObject_InstanceIndex:
+  case DXIL::OpCode::HitObject_IsHit:
+  case DXIL::OpCode::HitObject_IsMiss:
+  case DXIL::OpCode::HitObject_IsNop:
+  case DXIL::OpCode::HitObject_PrimitiveIndex:
+  case DXIL::OpCode::HitObject_RayFlags:
+  case DXIL::OpCode::HitObject_RayTCurrent:
+  case DXIL::OpCode::HitObject_RayTMin:
+  case DXIL::OpCode::HitObject_ShaderTableIndex: {
+    Value *HitObject = CI->getArgOperand(1);
+    if (isa<UndefValue>(HitObject))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    break;
+  }
+
+  // Shader Execution Reordering - vector getters
+  case DXIL::OpCode::HitObject_ObjectRayDirection:
+  case DXIL::OpCode::HitObject_ObjectRayOrigin:
+  case DXIL::OpCode::HitObject_WorldRayDirection:
+  case DXIL::OpCode::HitObject_WorldRayOrigin: {
+    Value *HitObject = CI->getArgOperand(1);
+    if (isa<UndefValue>(HitObject))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    Value *Col = CI->getArgOperand(2);
+    ValidateConstantRangeUnsigned(Col, "component", 0, 2, CI, Opcode, ValCtx);
+    break;
+  }
+
+  // Shader Execution Reordering - matrix getters
+  case DXIL::OpCode::HitObject_WorldToObject3x4:
+  case DXIL::OpCode::HitObject_ObjectToWorld3x4: {
+    Value *HitObject = CI->getArgOperand(1);
+    if (isa<UndefValue>(HitObject))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    Value *Row = CI->getArgOperand(2);
+    ValidateConstantRangeUnsigned(Row, "row", 0, 2, CI, Opcode, ValCtx);
+    Value *Col = CI->getArgOperand(3);
+    ValidateConstantRangeUnsigned(Col, "column", 0, 3, CI, Opcode, ValCtx);
+    break;
+  }
+
   case DXIL::OpCode::AtomicBinOp:
   case DXIL::OpCode::AtomicCompareExchange: {
     Type *pOverloadType = OP::GetOverloadType(Opcode, CI->getCalledFunction());
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
index bae2b0590c..daeabf9710 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl
@@ -14,7 +14,7 @@
 // DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject %[[HIT]])  ; HitObject_InstanceID(hitObject)
 // DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject %[[HIT]])  ; HitObject_PrimitiveIndex(hitObject)
 // DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject %[[HIT]])  ; HitObject_ShaderTableIndex(hitObject)
-// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %[[HIT]], i32 42)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
+// DXIL-DAG:   %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %[[HIT]], i32 40)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
 // DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 0)  ; HitObject_ObjectRayOrigin(hitObject,component)
 // DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 1)  ; HitObject_ObjectRayOrigin(hitObject,component)
 // DXIL-DAG:   %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 2)  ; HitObject_ObjectRayOrigin(hitObject,component)
@@ -88,7 +88,7 @@ void main() {
   isum += hit.GetInstanceID();
   isum += hit.GetPrimitiveIndex();
   isum += hit.GetShaderTableIndex();
-  isum += hit.LoadLocalRootTableConstant(42);
+  isum += hit.LoadLocalRootTableConstant(40);
 
   // float3 accessors
   vsum += hit.GetWorldRayOrigin();
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll
new file mode 100644
index 0000000000..7270996b91
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll
@@ -0,0 +1,202 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.HitObject = type { i8* }
+
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK: note: at '%r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 1)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r271 = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r270 = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r269 = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r286 = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r285 = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r284 = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r283 = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r282 = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r281 = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r272 = call i32 @dx.op.hitObject_StateScalar.i32(i32 272, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: parameter 'offset' must be a multiple of 4, got 7
+; CHECK: note: at '%r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: parameter 'offset' must be a multiple of 4, got 42
+; CHECK: note: at '%r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3.
+; CHECK: note: at '%r278_oobc = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_ObjectRayDirection must be an immediate constant.
+; CHECK: note: at '%r278_dync = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r278 = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3.
+; CHECK: note: at '%r277_oobc = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_ObjectRayOrigin must be an immediate constant.
+; CHECK: note: at '%r277_dync = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r277 = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3.
+; CHECK: note: at '%r276_oobc = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_WorldRayDirection must be an immediate constant.
+; CHECK: note: at '%r276_dync = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r276 = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3.
+; CHECK: note: at '%r275_oobc = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_WorldRayOrigin must be an immediate constant.
+; CHECK: note: at '%r275_dync = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r275 = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r274 = call float @dx.op.hitObject_StateScalar.f32(i32 274, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r273 = call float @dx.op.hitObject_StateScalar.f32(i32 273, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect column between 0~3, got 4.
+; CHECK: note: at '%r280_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 4)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: column of HitObject_WorldToObject3x4 must be an immediate constant.
+; CHECK: note: at '%r280_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect row between 0~2, got 3.
+; CHECK: note: at '%r280_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 3, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: row of HitObject_WorldToObject3x4 must be an immediate constant.
+; CHECK: note: at '%r280_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 %r272, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r280 = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject undef, i32 0, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect column between 0~3, got 4.
+; CHECK: note: at '%r279_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 4)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: column of HitObject_ObjectToWorld3x4 must be an immediate constant.
+; CHECK: note: at '%r279_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 %r272)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: expect row between 0~2, got 3.
+; CHECK: note: at '%r279_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 3, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: row of HitObject_ObjectToWorld3x4 must be an immediate constant.
+; CHECK: note: at '%r279_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 %r272, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK: note: at '%r279 = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject undef, i32 0, i32 0)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+%nop = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+  %r269 = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject undef)  ; HitObject_IsMiss(hitObject)
+
+  %r270 = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject undef)  ; HitObject_IsHit(hitObject)
+
+  %r271 = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject undef)  ; HitObject_IsNop(hitObject)
+
+  %r272 = call i32 @dx.op.hitObject_StateScalar.i32(i32 272, %dx.types.HitObject undef)  ; HitObject_RayFlags(hitObject)
+
+  %r273 = call float @dx.op.hitObject_StateScalar.f32(i32 273, %dx.types.HitObject undef)  ; HitObject_RayTMin(hitObject)
+
+  %r274 = call float @dx.op.hitObject_StateScalar.f32(i32 274, %dx.types.HitObject undef)  ; HitObject_RayTCurrent(hitObject)
+
+  %r275 = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject undef, i32 0)  ; HitObject_WorldRayOrigin(hitObject,component)
+  %r275_dync = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 %r272)  ; HitObject_WorldRayOrigin(hitObject,component)
+  %r275_oobc = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 3)  ; HitObject_WorldRayOrigin(hitObject,component)
+
+  %r276 = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject undef, i32 0)  ; HitObject_WorldRayDirection(hitObject,component)
+  %r276_dync = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 %r272)  ; HitObject_WorldRayDirection(hitObject,component)
+  %r276_oobc = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 3)  ; HitObject_WorldRayDirection(hitObject,component)
+
+  %r277 = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject undef, i32 0)  ; HitObject_ObjectRayOrigin(hitObject,component)
+  %r277_dync = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 %r272)  ; HitObject_ObjectRayOrigin(hitObject,component)
+  %r277_oobc = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 3)  ; HitObject_ObjectRayOrigin(hitObject,component)
+
+  %r278 = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject undef, i32 0)  ; HitObject_ObjectRayDirection(hitObject,component)
+  %r278_dync = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 %r272)  ; HitObject_ObjectRayDirection(hitObject,component)
+  %r278_oobc = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 3)  ; HitObject_ObjectRayDirection(hitObject,component)
+
+  %r279 = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject undef, i32 0, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+  %r279_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 %r272, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+  %r279_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 3, i32 0)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+  %r279_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 %r272)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+  %r279_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 4)  ; HitObject_ObjectToWorld3x4(hitObject,row,col)
+
+  %r280 = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject undef, i32 0, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+  %r280_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 %r272, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+  %r280_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 3, i32 0)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+  %r280_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 %r272)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+  %r280_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 4)  ; HitObject_WorldToObject3x4(hitObject,row,col)
+
+  %r281 = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject undef)  ; HitObject_GeometryIndex(hitObject)
+
+  %r282 = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject undef)  ; HitObject_InstanceIndex(hitObject)
+
+  %r283 = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject undef)  ; HitObject_InstanceID(hitObject)
+
+  %r284 = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject undef)  ; HitObject_PrimitiveIndex(hitObject)
+
+  %r285 = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject undef)  ; HitObject_HitKind(hitObject)
+
+  %r286 = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject undef)  ; HitObject_ShaderTableIndex(hitObject)
+
+  %r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 1)  ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex)
+  %r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef)  ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex)
+
+  %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
+  %r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
+
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare %dx.types.HitObject @dx.op.hitObject_MakeNop(i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32, %dx.types.HitObject, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i1 @dx.op.hitObject_StateScalar.i1(i32, %dx.types.HitObject) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @dx.op.hitObject_StateScalar.i32(i32, %dx.types.HitObject) #1
+
+; Function Attrs: nounwind readonly
+declare i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32, %dx.types.HitObject, i32) #2
+
+; Function Attrs: nounwind readnone
+declare float @dx.op.hitObject_StateVector.f32(i32, %dx.types.HitObject, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @dx.op.hitObject_StateScalar.f32(i32, %dx.types.HitObject) #1
+
+; Function Attrs: nounwind readnone
+declare float @dx.op.hitObject_StateMatrix.f32(i32, %dx.types.HitObject, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind argmemonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.typeAnnotations = !{!2}
+!dx.entryPoints = !{!3, !4}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !5}
+!3 = !{null, !"", null, null, !6}
+!4 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !7}
+!5 = !{!8}
+!6 = !{i32 0, i64 0}
+!7 = !{i32 8, i32 7, i32 5, !9}
+!8 = !{i32 1, !10, !10}
+!9 = !{i32 0}
+!10 = !{}
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll
index e527125009..74cc94fb78 100644
--- a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll
@@ -52,7 +52,7 @@ define void @"\01?main@@YAXXZ"() #0 {
 
   %r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %nop, i32 1)  ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex)
 
-  %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %nop, i32 42)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
+  %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %nop, i32 16)  ; HitObject_LoadLocalRootTableConstant(hitObject,offset)
 
   call void @dx.op.hitObject_Attributes.struct.AttribType(i32 289, %dx.types.HitObject %nop, %struct.AttribType* nonnull %attrs)  ; HitObject_Attributes(hitObject,attributes)
   ret void
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl
index 7b4182b739..05aa790ad4 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl
@@ -189,7 +189,7 @@
 // FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 367, %dx.types.HitObject* %[[HIT]])
 // FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 373, %dx.types.HitObject* %[[HIT]])
 // FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 377, %dx.types.HitObject* %[[HIT]])
-// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %[[HIT]], i32 42)
+// FCGL:   %{{[^ ]+}} = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %[[HIT]], i32 40)
 // FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 379, %dx.types.HitObject* %[[HIT]])
 // FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 378, %dx.types.HitObject* %[[HIT]])
 // FCGL:   %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 370, %dx.types.HitObject* %[[HIT]])
@@ -238,7 +238,7 @@ void main() {
   isum += hit.GetInstanceID();
   isum += hit.GetPrimitiveIndex();
   isum += hit.GetShaderTableIndex();
-  isum += hit.LoadLocalRootTableConstant(42);
+  isum += hit.LoadLocalRootTableConstant(40);
 
   // float3 accessors
   vsum += hit.GetWorldRayOrigin();
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 6344fb5849..b1460de9b8 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -8298,6 +8298,11 @@ def build_valrules(self):
             "Instr.UndefHitObject",
             "HitObject is undef.",
         )
+        self.add_valrule_msg(
+            "Instr.ParamMultiple",
+            "Parameter must be a valid multiple",
+            "parameter '%0' must be a multiple of %1, got %2",
+        )
         self.add_valrule(
             "Instr.MayReorderThreadUndefCoherenceHintParam",
             "Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.",

From b4d773fdf89a43cf983e28e11f0d8102e8723e18 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 25 Apr 2025 20:59:38 +0200
Subject: [PATCH 11/93] [SER] Validate MakeMiss (#7372)

Validate:
 HitObject_MakeMiss

Rules:
 No undef MissShaderIndex or RayFlags

SER implementation tracker: #7214
---
 lib/DxilValidation/DxilValidation.cpp         |  6 +++
 .../ser_hitobject_make_failing.ll             | 44 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll

diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 69eb2a88f2..1ef64ee1bb 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -1949,6 +1949,12 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
       ValCtx.EmitInstrError(
           CI, ValidationRule::InstrMayReorderThreadUndefCoherenceHintParam);
   } break;
+  case DXIL::OpCode::HitObject_MakeMiss: {
+    DxilInst_HitObject_MakeMiss MakeMiss(CI);
+    if (isa<UndefValue>(MakeMiss.get_RayFlags()) ||
+        isa<UndefValue>(MakeMiss.get_MissShaderIndex()))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+  } break;
 
   case DXIL::OpCode::HitObject_LoadLocalRootTableConstant: {
     Value *HitObject = CI->getArgOperand(1);
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll
new file mode 100644
index 0000000000..b47f178ca2
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll
@@ -0,0 +1,44 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.HitObject = type { i8* }
+
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK: note: at '%r265_udmiss = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 4, i32 undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK: note: at '%r265_udflags = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 undef, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %r265_udflags = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 undef, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)  ; HitObject_MakeMiss(RayFlags,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax)
+  %r265_udmiss = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 4, i32 undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)  ; HitObject_MakeMiss(RayFlags,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32, i32, i32, float, float, float, float, float, float, float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.typeAnnotations = !{!2}
+!dx.entryPoints = !{!9, !11}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !3}
+!3 = !{!4}
+!4 = !{i32 1, !5, !5}
+!5 = !{}
+!9 = !{null, !"", null, null, !10}
+!10 = !{i32 0, i64 0}
+!11 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !12}
+!12 = !{i32 8, i32 7, i32 5, !13}
+!13 = !{i32 0}

From 8209d53f0ef0257e5b8c78d22057086403946cca Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 25 Apr 2025 20:59:48 +0200
Subject: [PATCH 12/93] Fix static member call instantiation (#7377)

The first parameter of the HLSL_INTRINSIC record is actually a dummy
entry for the function name.
When builtin member function templates are instantiated, the 'this' ptr
is added as the first ParmVarDecl, which has the same index as the
first argument in the HLSL_INTRINSIC record.
This shifts the parameter names by one for static member functions, as
those do not have a 'this' pointer, as in:

```
| | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used MakeMiss 'dx::HitObject (unsigned int, unsigned int, RayDesc)' static
| | |   |-TemplateArgument type 'dx::HitObject'
| | |   |-TemplateArgument type 'unsigned int'
| | |   |-TemplateArgument type 'unsigned int'
| | |   |-TemplateArgument type 'RayDesc'
| | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MakeMiss 'unsigned int'
| | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayFlags 'unsigned int'
| | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'RayDesc'
```

The fix is to skip that dummy entry and start from the first actual
parameter name of the HLSL_INTRINSIC record when a static member
function template is declared.

Closes #7374
---
 tools/clang/lib/Sema/SemaHLSL.cpp                    | 10 ++++++++--
 .../hlsl/objects/HitObject/hitobject_make.hlsl       |  4 ++--
 .../objects/HitObject/hitobject_traceinvoke.hlsl     | 12 ++++++------
 .../objects/HitObject/hitobject_fromrayquery.hlsl    |  8 ++++----
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 5131d39f44..43946bc78a 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -5945,6 +5945,8 @@ class HLSLExternalSource : public ExternalSemaSource {
              "otherwise caller didn't initialize - there should be at least a "
              "void return type");
 
+    const bool IsStatic = IsStaticMember(intrinsic);
+
     // Create the template arguments.
     SmallVector<TemplateArgument, g_MaxIntrinsicParamCount + 1> templateArgs;
     for (size_t i = 0; i < parameterTypeCount; i++) {
@@ -6010,15 +6012,19 @@ class HLSLExternalSource : public ExternalSemaSource {
 
     SmallVector<ParmVarDecl *, g_MaxIntrinsicParamCount> Params;
     for (unsigned int i = 1; i < parameterTypeCount; i++) {
+      // The first parameter in the HLSL intrinsic record is just the intrinsic
+      // name and aliases with the 'this' pointer for non-static members. Skip
+      // this first parameter for static functions.
+      unsigned ParamIdx = IsStatic ? i : i - 1;
       IdentifierInfo *id =
-          &m_context->Idents.get(StringRef(intrinsic->pArgs[i - 1].pName));
+          &m_context->Idents.get(StringRef(intrinsic->pArgs[ParamIdx].pName));
       ParmVarDecl *paramDecl = ParmVarDecl::Create(
           *m_context, nullptr, NoLoc, NoLoc, id, parameterTypes[i], nullptr,
           StorageClass::SC_None, nullptr, paramMods[i - 1]);
       Params.push_back(paramDecl);
     }
 
-    StorageClass SC = IsStaticMember(intrinsic) ? SC_Static : SC_Extern;
+    StorageClass SC = IsStatic ? SC_Static : SC_Extern;
     QualType T = TInfo->getType();
     DeclarationNameInfo NameInfo(FunctionTemplate->getDeclName(), NoLoc);
     CXXMethodDecl *method = CXXMethodDecl::Create(
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl
index 1e947b2296..cc9515d7c1 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl
@@ -25,9 +25,9 @@
 // AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
 // AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
 // AST-NEXT: | | |   |-TemplateArgument type 'RayDesc'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MakeMiss 'unsigned int'
 // AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayFlags 'unsigned int'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'RayDesc'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Ray 'RayDesc'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 387
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
index 13bff4a3f4..4ea00475f1 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
@@ -13,8 +13,8 @@
 // AST-NEXT: | | |   |-TemplateArgument type 'void'
 // AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject':'dx::HitObject'
 // AST-NEXT: | | |   |-TemplateArgument type 'Payload'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Invoke 'dx::HitObject':'dx::HitObject'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> ho 'Payload &&__restrict'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> ho 'dx::HitObject':'dx::HitObject'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Payload 'Payload &&__restrict'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 382
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 
@@ -47,14 +47,14 @@
 // AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
 // AST-NEXT: | | |   |-TemplateArgument type 'RayDesc'
 // AST-NEXT: | | |   |-TemplateArgument type 'Payload'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> TraceRay 'RaytracingAccelerationStructure'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> AccelerationStructure 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> AccelerationStructure 'RaytracingAccelerationStructure'
 // AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayFlags 'unsigned int'
 // AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> InstanceInclusionMask 'unsigned int'
 // AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> RayContributionToHitGroupIndex 'unsigned int'
 // AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MultiplierForGeometryContributionToHitGroupIndex 'unsigned int'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'RayDesc'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Ray 'Payload &&__restrict'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> MissShaderIndex 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Ray 'RayDesc'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Payload 'Payload &&__restrict'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 389
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
index 004d25156a..e4a13d8a62 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
@@ -9,7 +9,7 @@
 // AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used FromRayQuery 'dx::HitObject (RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>)' static
 // AST-NEXT: | | |   |-TemplateArgument type 'dx::HitObject'
 // AST-NEXT: | | |   |-TemplateArgument type 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 363
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 
@@ -27,9 +27,9 @@
 // AST-NEXT: | | |   |-TemplateArgument type 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
 // AST-NEXT: | | |   |-TemplateArgument type 'unsigned int'
 // AST-NEXT: | | |   |-TemplateArgument type 'CustomAttrs'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> FromRayQuery 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'unsigned int'
-// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> HitKind 'CustomAttrs'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> rq 'RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH>':'RayQuery<5, 0>'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> HitKind 'unsigned int'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Attributes 'CustomAttrs'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 363
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 

From 34b6d0f91e6afd523bdc574836093f021713cce7 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Mon, 28 Apr 2025 10:23:47 -0700
Subject: [PATCH 13/93] Implementation of the CoopVec Inference and Training
 builtin intrinsics (#7290) (#7381)

Authored-by: Anupama Chandrasekhar <anupamac@nvidia.com>

Implements
HLSL:
__builtin_MatVecMul
__builtin_MatVecMulAdd
__builtin_OuterProductAccumulate
__builtin_VectorAccumulate

Lowered to
DXIL:
@dx.op.matVecMul
@dx.op.matVecMulAdd
@dx.op.outerProductAccumulate
@dx.op.vectorAccumulate

---------

Co-authored-by: Anupama Chandrasekhar <anupamac@nvidia.com>
Co-authored-by: Simon Moll <smoll@nvidia.com>
(cherry picked from commit 1db8c5b30b41f600c4c014fad7669d0e8f154a45)
---
 docs/DXIL.rst                                 |  12 +
 include/dxc/DXIL/DxilConstants.h              |  96 +++++--
 include/dxc/DXIL/DxilInstructions.h           | 230 +++++++++++++++
 .../dxc/DxilContainer/RDAT_LibraryTypes.inl   |   6 +-
 include/dxc/HLSL/HLOperations.h               |  48 ++++
 include/dxc/HlslIntrinsicOp.h                 |   6 +-
 lib/DXIL/DxilOperations.cpp                   | 108 ++++++-
 lib/DxilValidation/DxilValidation.cpp         | 271 ++++++++++++++++++
 lib/HLSL/HLOperationLower.cpp                 | 203 +++++++++++++
 tools/clang/lib/Sema/SemaHLSL.cpp             |  12 +
 .../linalg_builtins/check-shader-stages.hlsl  | 135 +++++++++
 .../linalg_builtins/linalg-builtins.hlsl      |  79 +++++
 .../intrinsics/linalg_builtins/lit.local.cfg  |   1 +
 .../mat-vec-mul-add_multioverload.hlsl        | 108 +++++++
 .../mat-vec-mul_multioverload.hlsl            | 104 +++++++
 ...uter-product-accumulate-multioverload.hlsl |  70 +++++
 .../linalg_builtins/vector-accumulate.hlsl    |  16 ++
 .../DXC/Passes/DxilGen/linalg-builtins.ll     | 189 ++++++++++++
 .../hlsl/linalg/unavailable-pre-sm69.hlsl     |  59 ++++
 utils/hct/gen_intrin_main.txt                 |   8 +
 utils/hct/hctdb.py                            | 151 ++++++++++
 utils/hct/hlsl_intrinsic_opcodes.json         |   8 +-
 22 files changed, 1894 insertions(+), 26 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl

diff --git a/docs/DXIL.rst b/docs/DXIL.rst
index a55f476450..69bcae8c53 100644
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@@ -2419,6 +2419,10 @@ ID  Name                                                  Description
 302 ReservedC9                                            reserved
 303 RawBufferVectorLoad                                   reads from a raw buffer and structured buffer
 304 RawBufferVectorStore                                  writes to a RWByteAddressBuffer or RWStructuredBuffer
+305 MatVecMul                                             Multiplies a MxK dimension matrix and a K sized input vector
+306 MatVecMulAdd                                          multiplies a MxK dimension matrix and a K sized input vector and adds an M-sized bias vector
+307 OuterProductAccumulate                                Computes the outer product between column vectors and an MxN matrix is accumulated component-wise atomically (with device scope) in memory
+308 VectorAccumulate                                      Accumulates the components of a vector component-wise atomically (with device scope) to the corresponding elements of an array in memory
 === ===================================================== =======================================================================================================================================================================================================================
 
 
@@ -3134,6 +3138,14 @@ INSTR.ILLEGALDXILOPCODE                               DXILOpCode must be [0..%0]
 INSTR.ILLEGALDXILOPFUNCTION                           '%0' is not a DXILOpFuncition for DXILOpcode '%1'.
 INSTR.IMMBIASFORSAMPLEB                               bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate.
 INSTR.INBOUNDSACCESS                                  Access to out-of-bounds memory is disallowed.
+INSTR.LINALGINTERPRETATIONPARAMARECONST               In Linalg operations, Interpretation value is a constant.
+INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFORMATVECOPS      Matrix Layout for Linalg Mul/MulAdd operation must be valid.
+INSTR.LINALGINVALIDMEMORYINTERPVALUE                  In Memory Interpolation value must be valid.
+INSTR.LINALGINVALIDREGISTERINTERPVALUE                From Register Interpretation value must be valid.
+INSTR.LINALGMATRIXLAYOUTNOTTRANSPOSABLE               Row Major and Column Major matrix layouts are not transposable.
+INSTR.LINALGMATRIXSHAPEPARAMSARECONST                 Matrix Layout, Dimensions and isTranspose are constants
+INSTR.LINALGNOTANUNSIGNEDTYPE                         Unsigned flag set for a float signed type
+INSTR.MATVECOPISUNSIGNEDFLAGSARECONST                 In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant.
 INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM         Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.
 INSTR.MINPRECISIONNOTPRECISE                          Instructions marked precise may not refer to minprecision values.
 INSTR.MINPRECISONBITCAST                              Bitcast on minprecison types is not allowed.
diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index 3752274f18..fe32c06f63 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -162,24 +162,32 @@ const unsigned kDxilMaxOloadDims = 2;
 
 enum class ComponentType : uint32_t {
   Invalid = 0,
-  I1,
-  I16,
-  U16,
-  I32,
-  U32,
-  I64,
-  U64,
-  F16,
-  F32,
-  F64,
-  SNormF16,
-  UNormF16,
-  SNormF32,
-  UNormF32,
-  SNormF64,
-  UNormF64,
-  PackedS8x32,
-  PackedU8x32,
+  I1 = 1,
+  I16 = 2,
+  U16 = 3,
+  I32 = 4,
+  U32 = 5,
+  I64 = 6,
+  U64 = 7,
+  F16 = 8,
+  F32 = 9,
+  F64 = 10,
+  SNormF16 = 11,
+  UNormF16 = 12,
+  SNormF32 = 13,
+  UNormF32 = 14,
+  SNormF64 = 15,
+  UNormF64 = 16,
+  PackedS8x32 = 17,
+  PackedU8x32 = 18,
+
+  // BEGIN NEW FOR SM 6.9
+  U8 = 19,
+  I8 = 20,
+  F8_E4M3 = 21,
+  F8_E5M2 = 22,
+  // END
+
   LastEntry
 };
 
@@ -743,6 +751,19 @@ enum class OpCode : unsigned {
   CreateHandleForLib =
       160, // create resource handle from resource struct for library
 
+  // Linear Algebra Operations
+  MatVecMul =
+      305, // Multiplies a MxK dimension matrix and a K sized input vector
+  MatVecMulAdd = 306, // multiplies a MxK dimension matrix and a K sized input
+                      // vector and adds an M-sized bias vector
+  OuterProductAccumulate =
+      307, // Computes the outer product between column vectors and an MxN
+           // matrix is accumulated component-wise atomically (with device
+           // scope) in memory
+  VectorAccumulate = 308, // Accumulates the components of a vector
+                          // component-wise atomically (with device scope) to
+                          // the corresponding elements of an array in memory
+
   // Mesh shader instructions
   EmitIndices = 169, // emit a primitive's vertex indices in a mesh shader
   GetMeshPayload =
@@ -1060,7 +1081,7 @@ enum class OpCode : unsigned {
   NumOpCodes_Dxil_1_7 = 226,
   NumOpCodes_Dxil_1_8 = 258,
 
-  NumOpCodes = 305 // exclusive last value of enumeration
+  NumOpCodes = 309 // exclusive last value of enumeration
 };
 // OPCODE-ENUM:END
 
@@ -1201,6 +1222,12 @@ enum class OpCodeClass : unsigned {
   // Library create handle from resource struct (like HL intrinsic)
   CreateHandleForLib,
 
+  // Linear Algebra Operations
+  MatVecMul,
+  MatVecMulAdd,
+  OuterProductAccumulate,
+  VectorAccumulate,
+
   // Mesh shader instructions
   EmitIndices,
   GetMeshPayload,
@@ -1385,7 +1412,7 @@ enum class OpCodeClass : unsigned {
   NumOpClasses_Dxil_1_7 = 153,
   NumOpClasses_Dxil_1_8 = 174,
 
-  NumOpClasses = 190 // exclusive last value of enumeration
+  NumOpClasses = 194 // exclusive last value of enumeration
 };
 // OPCODECLASS-ENUM:END
 
@@ -1561,6 +1588,28 @@ const unsigned kHitObjectTraceRay_RayDescOpIdx = 7;
 const unsigned kHitObjectTraceRay_PayloadOpIdx = 15;
 const unsigned kHitObjectTraceRay_NumOp = 16;
 
+// MatVec Ops
+const unsigned kMatVecMulInputVectorIdx = 1;
+const unsigned kMatVecMulIsInputUnsignedIdx = 2;
+const unsigned kMatVecMulInputInterpretationIdx = 3;
+const unsigned kMatVecMulMatrixBufferIdx = 4;
+const unsigned kMatVecMulMatrixOffsetIdx = 5;
+const unsigned kMatVecMulMatrixInterpretationIdx = 6;
+const unsigned kMatVecMulMatrixMIdx = 7;
+const unsigned kMatVecMulMatrixKIdx = 8;
+const unsigned kMatVecMulMatrixLayoutIdx = 9;
+const unsigned kMatVecMulMatrixTransposeIdx = 10;
+const unsigned kMatVecMulMatrixStrideIdx = 11;
+const unsigned kMatVecMulIsOutputUnsignedIdx = 12;
+
+// MatVecAdd
+const unsigned kMatVecMulAddBiasInterpretation = 14;
+const unsigned kMatVecMulAddIsOutputUnsignedIdx = 15;
+
+// Outer Product Accumulate
+const unsigned kOuterProdAccMatrixInterpretation = 5;
+const unsigned kOuterProdAccMatrixLayout = 6;
+
 // TODO: add operand index for all the OpCodeClass.
 } // namespace OperandIndex
 
@@ -2132,6 +2181,13 @@ extern const char *kHostLayoutTypePrefix;
 
 extern const char *kWaveOpsIncludeHelperLanesString;
 
+enum class LinalgMatrixLayout : uint32_t {
+  RowMajor = 0,
+  ColumnMajor = 1,
+  MulOptimal = 2,
+  OuterProductOptimal = 3,
+};
+
 } // namespace DXIL
 
 } // namespace hlsl
diff --git a/include/dxc/DXIL/DxilInstructions.h b/include/dxc/DXIL/DxilInstructions.h
index a99c5360d4..9a4030fd8e 100644
--- a/include/dxc/DXIL/DxilInstructions.h
+++ b/include/dxc/DXIL/DxilInstructions.h
@@ -9918,5 +9918,235 @@ struct DxilInst_RawBufferVectorStore {
                              llvm::APInt(32, (uint64_t)val)));
   }
 };
+
+/// This instruction Multiplies a MxK dimension matrix and a K sized input
+/// vector
+struct DxilInst_MatVecMul {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_MatVecMul(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::MatVecMul);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (13 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands())
+      return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_inputVector = 1,
+    arg_isInputUnsigned = 2,
+    arg_inputInterpretation = 3,
+    arg_matrixBuffer = 4,
+    arg_matrixOffset = 5,
+    arg_matrixIntepretation = 6,
+    arg_matrixM = 7,
+    arg_matrixK = 8,
+    arg_matrixLayout = 9,
+    arg_matrixTranspose = 10,
+    arg_matrixStride = 11,
+    arg_isOutputUnsigned = 12,
+  };
+  // Accessors
+  llvm::Value *get_inputVector() const { return Instr->getOperand(1); }
+  void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); }
+  void set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); }
+  void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); }
+  llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); }
+  void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_matrixOffset() const { return Instr->getOperand(5); }
+  void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); }
+  llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); }
+  void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); }
+  llvm::Value *get_matrixM() const { return Instr->getOperand(7); }
+  void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); }
+  llvm::Value *get_matrixK() const { return Instr->getOperand(8); }
+  void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); }
+  llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); }
+  void set_matrixLayout(llvm::Value *val) { Instr->setOperand(9, val); }
+  llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); }
+  void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); }
+  llvm::Value *get_matrixStride() const { return Instr->getOperand(11); }
+  void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); }
+  llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(12); }
+  void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(12, val); }
+};
+
+/// This instruction multiplies a MxK dimension matrix and a K sized input
+/// vector and adds an M-sized bias vector
+struct DxilInst_MatVecMulAdd {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_MatVecMulAdd(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr,
+                                          hlsl::OP::OpCode::MatVecMulAdd);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (16 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands())
+      return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_inputVector = 1,
+    arg_isInputUnsigned = 2,
+    arg_inputInterpretation = 3,
+    arg_matrixBuffer = 4,
+    arg_matrixOffset = 5,
+    arg_matrixIntepretation = 6,
+    arg_matrixM = 7,
+    arg_matrixK = 8,
+    arg_matrixLayout = 9,
+    arg_matrixTranspose = 10,
+    arg_matrixStride = 11,
+    arg_biasBuffer = 12,
+    arg_biasOffset = 13,
+    arg_biasIntepretation = 14,
+    arg_isOutputUnsigned = 15,
+  };
+  // Accessors
+  llvm::Value *get_inputVector() const { return Instr->getOperand(1); }
+  void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); }
+  void set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); }
+  void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); }
+  llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); }
+  void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_matrixOffset() const { return Instr->getOperand(5); }
+  void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); }
+  llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); }
+  void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); }
+  llvm::Value *get_matrixM() const { return Instr->getOperand(7); }
+  void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); }
+  llvm::Value *get_matrixK() const { return Instr->getOperand(8); }
+  void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); }
+  llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); }
+  void set_matrixLayout(llvm::Value *val) { Instr->setOperand(9, val); }
+  llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); }
+  void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); }
+  llvm::Value *get_matrixStride() const { return Instr->getOperand(11); }
+  void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); }
+  llvm::Value *get_biasBuffer() const { return Instr->getOperand(12); }
+  void set_biasBuffer(llvm::Value *val) { Instr->setOperand(12, val); }
+  llvm::Value *get_biasOffset() const { return Instr->getOperand(13); }
+  void set_biasOffset(llvm::Value *val) { Instr->setOperand(13, val); }
+  llvm::Value *get_biasIntepretation() const { return Instr->getOperand(14); }
+  void set_biasIntepretation(llvm::Value *val) { Instr->setOperand(14, val); }
+  llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(15); }
+  void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(15, val); }
+};
+
+/// This instruction Computes the outer product between column vectors and an
+/// MxN matrix is accumulated component-wise atomically (with device scope) in
+/// memory
+struct DxilInst_OuterProductAccumulate {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_OuterProductAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(
+        Instr, hlsl::OP::OpCode::OuterProductAccumulate);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (8 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands())
+      return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_inputVector1 = 1,
+    arg_inputVector2 = 2,
+    arg_matrixBuffer = 3,
+    arg_matrixOffset = 4,
+    arg_matrixIntepretation = 5,
+    arg_matrixLayout = 6,
+    arg_matrixStride = 7,
+  };
+  // Accessors
+  llvm::Value *get_inputVector1() const { return Instr->getOperand(1); }
+  void set_inputVector1(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_inputVector2() const { return Instr->getOperand(2); }
+  void set_inputVector2(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_matrixBuffer() const { return Instr->getOperand(3); }
+  void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(3, val); }
+  llvm::Value *get_matrixOffset() const { return Instr->getOperand(4); }
+  void set_matrixOffset(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(5); }
+  void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(5, val); }
+  int32_t get_matrixIntepretation_val() const {
+    return (int32_t)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(5))
+                         ->getZExtValue());
+  }
+  void set_matrixIntepretation_val(int32_t val) {
+    Instr->setOperand(5, llvm::Constant::getIntegerValue(
+                             llvm::IntegerType::get(Instr->getContext(), 32),
+                             llvm::APInt(32, (uint64_t)val)));
+  }
+  llvm::Value *get_matrixLayout() const { return Instr->getOperand(6); }
+  void set_matrixLayout(llvm::Value *val) { Instr->setOperand(6, val); }
+  int32_t get_matrixLayout_val() const {
+    return (int32_t)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(6))
+                         ->getZExtValue());
+  }
+  void set_matrixLayout_val(int32_t val) {
+    Instr->setOperand(6, llvm::Constant::getIntegerValue(
+                             llvm::IntegerType::get(Instr->getContext(), 32),
+                             llvm::APInt(32, (uint64_t)val)));
+  }
+  llvm::Value *get_matrixStride() const { return Instr->getOperand(7); }
+  void set_matrixStride(llvm::Value *val) { Instr->setOperand(7, val); }
+};
+
+/// This instruction Accumulates the components of a vector component-wise
+/// atomically (with device scope) to the corresponding elements of an array in
+/// memory
+struct DxilInst_VectorAccumulate {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_VectorAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr,
+                                          hlsl::OP::OpCode::VectorAccumulate);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (4 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands())
+      return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_inputVector = 1,
+    arg_arrayBuffer = 2,
+    arg_arrayOffset = 3,
+  };
+  // Accessors
+  llvm::Value *get_inputVector() const { return Instr->getOperand(1); }
+  void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_arrayBuffer() const { return Instr->getOperand(2); }
+  void set_arrayBuffer(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_arrayOffset() const { return Instr->getOperand(3); }
+  void set_arrayOffset(llvm::Value *val) { Instr->setOperand(3, val); }
+};
 // INSTR-HELPER:END
 } // namespace hlsl
diff --git a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl
index 4b58b406c2..902f2e9652 100644
--- a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl
+++ b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl
@@ -565,9 +565,13 @@ RDAT_DXIL_ENUM_START(hlsl::DXIL::ComponentType, uint32_t)
   RDAT_ENUM_VALUE_NODEF(UNormF64)
   RDAT_ENUM_VALUE_NODEF(PackedS8x32)
   RDAT_ENUM_VALUE_NODEF(PackedU8x32)
+  RDAT_ENUM_VALUE_NODEF(U8)
+  RDAT_ENUM_VALUE_NODEF(I8)
+  RDAT_ENUM_VALUE_NODEF(F8_E4M3)
+  RDAT_ENUM_VALUE_NODEF(F8_E5M2)
   RDAT_ENUM_VALUE_NODEF(LastEntry)
 #if DEF_RDAT_ENUMS == DEF_RDAT_DUMP_IMPL
-  static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 19,
+  static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 23,
                 "otherwise, RDAT_DXIL_ENUM definition needs updating");
 #endif
 RDAT_ENUM_END()
diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h
index 0e9b8c2710..c75318da99 100644
--- a/include/dxc/HLSL/HLOperations.h
+++ b/include/dxc/HLSL/HLOperations.h
@@ -445,6 +445,54 @@ const unsigned kHitObjectTraceRay_NumOp = 10;
 const unsigned kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx = 4;
 const unsigned kHitObjectFromRayQuery_WithAttrs_NumOp = 5;
 
+// Linear Algebra Operations
+
+// MatVecMul
+const unsigned kMatVecMulOutputVectorIdx = 1;
+const unsigned kMatVecMulIsOutputUnsignedIdx = 2;
+const unsigned kMatVecMulInputVectorIdx = 3;
+const unsigned kMatVecMulIsInputUnsignedIdx = 4;
+const unsigned kMatVecMulInputInterpretationIdx = 5;
+const unsigned kMatVecMulMatrixBufferIdx = 6;
+const unsigned kMatVecMulMatrixOffsetIdx = 7;
+const unsigned kMatVecMulMatrixInterpretationIdx = 8;
+const unsigned kMatVecMulMatrixMIdx = 9;
+const unsigned kMatVecMulMatrixKIdx = 10;
+const unsigned kMatVecMulMatrixLayoutIdx = 11;
+const unsigned kMatVecMulMatrixTransposeIdx = 12;
+const unsigned kMatVecMulMatrixStrideIdx = 13;
+
+// MatVecMulAdd
+const unsigned kMatVecMulAddOutputVectorIdx = 1;
+const unsigned kMatVecMulAddIsOutputUnsignedIdx = 2;
+const unsigned kMatVecMulAddInputVectorIdx = 3;
+const unsigned kMatVecMulAddIsInputUnsignedIdx = 4;
+const unsigned kMatVecMulAddInputInterpretationIdx = 5;
+const unsigned kMatVecMulAddMatrixBufferIdx = 6;
+const unsigned kMatVecMulAddMatrixOffsetIdx = 7;
+const unsigned kMatVecMulAddMatrixInterpretationIdx = 8;
+const unsigned kMatVecMulAddMatrixMIdx = 9;
+const unsigned kMatVecMulAddMatrixKIdx = 10;
+const unsigned kMatVecMulAddMatrixLayoutIdx = 11;
+const unsigned kMatVecMulAddMatrixTransposeIdx = 12;
+const unsigned kMatVecMulAddMatrixStrideIdx = 13;
+const unsigned kMatVecMulAddBiasBufferIdx = 14;
+const unsigned kMatVecMulAddBiasOffsetIdx = 15;
+const unsigned kMatVecMulAddBiasInterpretationIdx = 16;
+
+// OuterProductAccumulate
+const unsigned kOuterProdAccInputVec1Idx = 1;
+const unsigned kOuterProdAccInputVec2Idx = 2;
+const unsigned kOuterProdAccMatrixIdx = 3;
+const unsigned kOuterProdAccMatrixOffsetIdx = 4;
+const unsigned kOuterProdAccMatrixInterpretationIdx = 5;
+const unsigned kOuterProdAccMatrixLayoutIdx = 6;
+const unsigned kOuterProdAccMatrixStrideIdx = 7;
+
+// Vector Accumulate
+const unsigned kVectorAccInputVecIdx = 1;
+const unsigned kVectorAccMatrixIdx = 2;
+const unsigned kVectorAccMatrixOffsetIdx = 3;
 } // namespace HLOperandIndex
 
 llvm::Function *GetOrCreateHLFunction(llvm::Module &M,
diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h
index d37c27a38e..197bd3e1f5 100644
--- a/include/dxc/HlslIntrinsicOp.h
+++ b/include/dxc/HlslIntrinsicOp.h
@@ -107,6 +107,10 @@ enum class IntrinsicOp {
   IOP_WorldToObject = 99,
   IOP_WorldToObject3x4 = 100,
   IOP_WorldToObject4x3 = 101,
+  IOP___builtin_MatVecMul = 390,
+  IOP___builtin_MatVecMulAdd = 391,
+  IOP___builtin_OuterProductAccumulate = 392,
+  IOP___builtin_VectorAccumulate = 393,
   IOP_abort = 102,
   IOP_abs = 103,
   IOP_acos = 104,
@@ -396,7 +400,7 @@ enum class IntrinsicOp {
   IOP_usign = 355,
   MOP_InterlockedUMax = 356,
   MOP_InterlockedUMin = 357,
-  Num_Intrinsics = 390,
+  Num_Intrinsics = 394,
 };
 inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
   switch (opcode) {
diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index f614ba9d14..95e8dfaeba 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -2652,6 +2652,40 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      1,
      {{0x4e7}},
      {{0xe7}}}, // Overloads: hfwidl<hfwidl
+
+    // Linear Algebra Operations
+    {OC::MatVecMul,
+     "MatVecMul",
+     OCC::MatVecMul,
+     "matVecMul",
+     Attribute::ReadOnly,
+     2,
+     {{0x400}, {0x400}},
+     {{0x63}, {0x63}}}, // Overloads: <hfwi,<hfwi
+    {OC::MatVecMulAdd,
+     "MatVecMulAdd",
+     OCC::MatVecMulAdd,
+     "matVecMulAdd",
+     Attribute::ReadOnly,
+     2,
+     {{0x400}, {0x400}},
+     {{0x63}, {0x63}}}, // Overloads: <hfwi,<hfwi
+    {OC::OuterProductAccumulate,
+     "OuterProductAccumulate",
+     OCC::OuterProductAccumulate,
+     "outerProductAccumulate",
+     Attribute::None,
+     2,
+     {{0x400}, {0x400}},
+     {{0x63}, {0x63}}}, // Overloads: <hfwi,<hfwi
+    {OC::VectorAccumulate,
+     "VectorAccumulate",
+     OCC::VectorAccumulate,
+     "vectorAccumulate",
+     Attribute::None,
+     1,
+     {{0x400}},
+     {{0x63}}}, // Overloads: <hfwi
 };
 // OPCODE-OLOADS:END
 
@@ -3440,8 +3474,9 @@ void OP::GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,
     return;
   }
   // Instructions: AllocateRayQuery2=258, RawBufferVectorLoad=303,
-  // RawBufferVectorStore=304
-  if (op == 258 || (303 <= op && op <= 304)) {
+  // RawBufferVectorStore=304, MatVecMul=305, MatVecMulAdd=306,
+  // OuterProductAccumulate=307, VectorAccumulate=308
+  if (op == 258 || (303 <= op && op <= 308)) {
     major = 6;
     minor = 9;
     return;
@@ -5890,6 +5925,61 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pETy);
     A(pI32);
     break;
+
+    // Linear Algebra Operations
+  case OpCode::MatVecMul:
+    EXT(0);
+    A(pI32);
+    EXT(1);
+    A(pI1);
+    A(pI32);
+    A(pRes);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI1);
+    A(pI32);
+    A(pI1);
+    break;
+  case OpCode::MatVecMulAdd:
+    EXT(0);
+    A(pI32);
+    EXT(1);
+    A(pI1);
+    A(pI32);
+    A(pRes);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI1);
+    A(pI32);
+    A(pRes);
+    A(pI32);
+    A(pI32);
+    A(pI1);
+    break;
+  case OpCode::OuterProductAccumulate:
+    A(pV);
+    A(pI32);
+    EXT(0);
+    EXT(1);
+    A(pRes);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    A(pI32);
+    break;
+  case OpCode::VectorAccumulate:
+    A(pV);
+    A(pI32);
+    A(pETy);
+    A(pRes);
+    A(pI32);
+    break;
   // OPCODE-OLOAD-FUNCS:END
   default:
     DXASSERT(false, "otherwise unhandled case");
@@ -6061,6 +6151,7 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::WaveActiveAllEqual:
   case OpCode::CreateHandleForLib:
   case OpCode::WaveMatch:
+  case OpCode::VectorAccumulate:
     if (FT->getNumParams() <= 1)
       return nullptr;
     return FT->getParamType(1);
@@ -6291,6 +6382,19 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
     StructType *ST = cast<StructType>(Ty);
     return ST->getElementType(0);
   }
+  case OpCode::MatVecMul:
+  case OpCode::MatVecMulAdd:
+    if (FT->getNumParams() < 2)
+      return nullptr;
+    return llvm::StructType::get(Ctx,
+                                 {FT->getReturnType(), FT->getParamType(1)});
+
+  case OpCode::OuterProductAccumulate:
+    if (FT->getNumParams() < 3)
+      return nullptr;
+    return llvm::StructType::get(Ctx,
+                                 {FT->getParamType(1), FT->getParamType(2)});
+
   // OPCODE-OLOAD-TYPES:END
   default:
     return Ty;
diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 1ef64ee1bb..c4448d1ec4 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -970,6 +970,267 @@ static void ValidateImmOperandForMathDxilOp(CallInst *CI, DXIL::OpCode Opcode,
   }
 }
 
+static bool CheckLinalgInterpretation(uint32_t Input, bool InRegister) {
+  using CT = DXIL::ComponentType;
+  switch (static_cast<CT>(Input)) {
+  case CT::I16:
+  case CT::U16:
+  case CT::I32:
+  case CT::U32:
+  case CT::F16:
+  case CT::F32:
+  case CT::U8:
+  case CT::I8:
+  case CT::F8_E4M3:
+  case CT::F8_E5M2:
+    return true;
+  case CT::PackedS8x32:
+  case CT::PackedU8x32:
+    return InRegister;
+  default:
+    return false;
+  }
+}
+
+static bool CheckMatrixLayoutForMatVecMulOps(unsigned Layout) {
+  return Layout <=
+         static_cast<unsigned>(DXIL::LinalgMatrixLayout::OuterProductOptimal);
+}
+
+std::string GetMatrixLayoutStr(unsigned Layout) {
+  switch (static_cast<DXIL::LinalgMatrixLayout>(Layout)) {
+  case DXIL::LinalgMatrixLayout::RowMajor:
+    return "RowMajor";
+  case DXIL::LinalgMatrixLayout::ColumnMajor:
+    return "ColumnMajor";
+  case DXIL::LinalgMatrixLayout::MulOptimal:
+    return "MulOptimal";
+  case DXIL::LinalgMatrixLayout::OuterProductOptimal:
+    return "OuterProductOptimal";
+  default:
+    DXASSERT_NOMSG(false);
+    return "Invalid";
+  }
+}
+
+static bool CheckTransposeForMatrixLayout(unsigned Layout, bool Transposed) {
+  switch (static_cast<DXIL::LinalgMatrixLayout>(Layout)) {
+  case DXIL::LinalgMatrixLayout::RowMajor:
+  case DXIL::LinalgMatrixLayout::ColumnMajor:
+    return !Transposed;
+
+  default:
+    return true;
+  }
+}
+
+static bool CheckUnsignedFlag(Type *VecTy, bool IsUnsigned) {
+  Type *ElemTy = VecTy->getScalarType();
+  if (ElemTy->isFloatingPointTy())
+    return !IsUnsigned;
+
+  return true;
+}
+
+static Value *GetMatVecOpIsOutputUnsigned(CallInst *CI, DXIL::OpCode OpCode) {
+  switch (OpCode) {
+  case DXIL::OpCode::MatVecMul:
+    return CI->getOperand(DXIL::OperandIndex::kMatVecMulIsOutputUnsignedIdx);
+  case DXIL::OpCode::MatVecMulAdd:
+    return CI->getOperand(DXIL::OperandIndex::kMatVecMulAddIsOutputUnsignedIdx);
+
+  default:
+    DXASSERT_NOMSG(false);
+    return nullptr;
+  }
+}
+
+static void ValidateImmOperandsForMatVecOps(CallInst *CI, DXIL::OpCode OpCode,
+                                            ValidationContext &ValCtx) {
+  // Validate MatVecMul/MatVecMulAdd immediates; bail on the first violation.
+  llvm::Value *IsInputUnsigned =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulIsInputUnsignedIdx);
+  ConstantInt *IsInputUnsignedConst =
+      dyn_cast<llvm::ConstantInt>(IsInputUnsigned);
+  if (!IsInputUnsignedConst) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst,
+        {"IsInputUnsigned"});
+    return;
+  }
+
+  llvm::Value *IsOutputUnsigned = GetMatVecOpIsOutputUnsigned(CI, OpCode);
+  ConstantInt *IsOutputUnsignedConst =
+      dyn_cast<llvm::ConstantInt>(IsOutputUnsigned);
+  if (!IsOutputUnsignedConst) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst,
+        {"IsOutputUnsigned"});
+    return;
+  }
+
+  llvm::Value *InputInterpretation =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulInputInterpretationIdx);
+  ConstantInt *II = dyn_cast<ConstantInt>(InputInterpretation);
+  if (!II) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInterpretationParamAreConst,
+        {"InputInterpretation"});
+    return;
+  }
+  uint64_t IIValue = II->getLimitedValue();
+  if (!CheckLinalgInterpretation(IIValue, true)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInvalidRegisterInterpValue,
+        {std::to_string(IIValue), "Input"});
+    return;
+  }
+
+  llvm::Value *MatrixInterpretation =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixInterpretationIdx);
+  ConstantInt *MI = dyn_cast<ConstantInt>(MatrixInterpretation);
+  if (!MI) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInterpretationParamAreConst,
+        {"MatrixInterpretation"});
+    return;
+  }
+  uint64_t MIValue = MI->getLimitedValue();
+  if (!CheckLinalgInterpretation(MIValue, false)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue,
+        {std::to_string(MIValue), "Matrix"});
+    return;
+  }
+
+  llvm::Value *MatrixM =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixMIdx);
+  if (!llvm::isa<llvm::Constant>(MatrixM)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst,
+        {"Matrix M dimension"});
+    return;
+  }
+
+  llvm::Value *MatrixK =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixKIdx);
+  if (!llvm::isa<llvm::Constant>(MatrixK)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst,
+        {"Matrix K dimension"});
+    return;
+  }
+
+  llvm::Value *MatrixLayout =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixLayoutIdx);
+
+  ConstantInt *MatrixLayoutConst = dyn_cast<ConstantInt>(MatrixLayout);
+  if (!MatrixLayoutConst) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst,
+        {"Matrix Layout"});
+    return;
+  }
+  uint64_t MLValue = MatrixLayoutConst->getLimitedValue();
+  if (!CheckMatrixLayoutForMatVecMulOps(MLValue)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInvalidMatrixLayoutValueForMatVecOps,
+        {std::to_string(MLValue),
+         std::to_string(
+             static_cast<unsigned>(DXIL::LinalgMatrixLayout::RowMajor)),
+         std::to_string(static_cast<unsigned>(
+             DXIL::LinalgMatrixLayout::OuterProductOptimal))});
+    return;
+  }
+
+  llvm::Value *MatrixTranspose =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixTransposeIdx);
+  ConstantInt *MatrixTransposeConst = dyn_cast<ConstantInt>(MatrixTranspose);
+  if (!MatrixTransposeConst) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst,
+        {"MatrixTranspose"});
+    return;
+  }
+
+  if (!CheckTransposeForMatrixLayout(MLValue,
+                                     MatrixTransposeConst->getLimitedValue())) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixLayoutNotTransposable,
+        {GetMatrixLayoutStr(MLValue)});
+    return;
+  }
+
+  llvm::Value *InputVector =
+      CI->getOperand(DXIL::OperandIndex::kMatVecMulInputVectorIdx);
+  if (!CheckUnsignedFlag(InputVector->getType(),
+                         IsInputUnsignedConst->getLimitedValue())) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Input"});
+    return;
+  }
+
+  if (!CheckUnsignedFlag(CI->getType(),
+                         IsOutputUnsignedConst->getLimitedValue())) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Output"});
+    return;
+  }
+  // MatVecMulAdd only: dyn_cast so a non-constant operand is diagnosed.
+  switch (OpCode) {
+  case DXIL::OpCode::MatVecMulAdd: {
+    llvm::Value *BiasInterpretation =
+        CI->getOperand(DXIL::OperandIndex::kMatVecMulAddBiasInterpretation);
+    ConstantInt *BI = dyn_cast<ConstantInt>(BiasInterpretation);
+    if (!BI) {
+      ValCtx.EmitInstrFormatError(
+          CI, ValidationRule::InstrLinalgInterpretationParamAreConst,
+          {"BiasInterpretation"});
+      return;
+    }
+    uint64_t BIValue = BI->getLimitedValue();
+    if (!CheckLinalgInterpretation(BIValue, false)) {
+      ValCtx.EmitInstrFormatError(
+          CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue,
+          {std::to_string(BIValue), "Bias vector"});
+      return;
+    }
+  } break;
+  default:
+    break;
+  }
+}
+
+static void ValidateImmOperandsForOuterProdAcc(CallInst *CI,
+                                               ValidationContext &ValCtx) {
+  // Validate OuterProductAccumulate immediates; bail on the first violation.
+  llvm::Value *MatrixInterpretation =
+      CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixInterpretation);
+  ConstantInt *MI = dyn_cast<ConstantInt>(MatrixInterpretation);
+  if (!MI) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInterpretationParamAreConst,
+        {"MatrixInterpretation"});
+    return;
+  }
+  uint64_t MIValue = MI->getLimitedValue();
+  if (!CheckLinalgInterpretation(MIValue, false)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue,
+        {std::to_string(MIValue), "Matrix"});
+    return;
+  }
+
+  llvm::Value *MatrixLayout =
+      CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixLayout);
+  if (!llvm::isa<llvm::Constant>(MatrixLayout)) {
+    ValCtx.EmitInstrFormatError(
+        CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst,
+        {"MatrixLayout"});
+    return;
+  }
+}
+
 // Validate the type-defined mask compared to the store value mask which
 // indicates which parts were defined returns true if caller should continue
 // validation
@@ -2110,6 +2371,16 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
                                  GetLaunchTypeStr(NodeLaunchType)});
 
     break;
+  case DXIL::OpCode::MatVecMul:
+  case DXIL::OpCode::MatVecMulAdd:
+    ValidateImmOperandsForMatVecOps(CI, Opcode, ValCtx);
+    break;
+  case DXIL::OpCode::OuterProductAccumulate:
+    ValidateImmOperandsForOuterProdAcc(CI, ValCtx);
+    break;
+  case DXIL::OpCode::VectorAccumulate:
+
+    break;
 
   default:
     // TODO: make sure every Opcode is checked.
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 4ef7591e89..18d003a764 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -6562,6 +6562,200 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
 
   return Builder.CreateSelect(cond, t, f);
 }
+
+Value *TranslateMatVecMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
+                          HLOperationLowerHelper &Helper,
+                          HLObjectOperationLowerHelper *ObjHelper,
+                          bool &Translated) {
+
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Constant *OpArg = HlslOp->GetU32Const(static_cast<unsigned>(OpCode));
+
+  // Input parameters
+  Value *InputVector =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulInputVectorIdx);
+  Value *InputIsUnsigned =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulIsInputUnsignedIdx);
+  Value *InputInterpretation =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulInputInterpretationIdx);
+
+  // Matrix parameters
+  Value *MatrixBuffer =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixBufferIdx);
+  Value *MatrixOffset =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixOffsetIdx);
+  Value *MatrixInterpretation =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixInterpretationIdx);
+  Value *MatrixM = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixMIdx);
+  Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixKIdx);
+  Value *MatrixLayout =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixLayoutIdx);
+  Value *MatrixTranspose =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixTransposeIdx);
+  Value *MatrixStride =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixStrideIdx);
+
+  // Output parameters
+  Value *OutputIsUnsigned =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulIsOutputUnsignedIdx);
+
+  // Get the DXIL function for the operation
+  Function *DxilFunc = HlslOp->GetOpFunc(
+      OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx)
+                   ->getType()
+                   ->getPointerElementType(),
+               InputVector->getType()});
+
+  // Create a call to the DXIL function
+  Value *NewCI = Builder.CreateCall(
+      DxilFunc,
+      {OpArg, InputVector, InputIsUnsigned, InputInterpretation, MatrixBuffer,
+       MatrixOffset, MatrixInterpretation, MatrixM, MatrixK, MatrixLayout,
+       MatrixTranspose, MatrixStride, OutputIsUnsigned});
+
+  // Get the output parameter and store the result
+  Value *OutParam =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx);
+
+  Builder.CreateStore(NewCI, OutParam);
+
+  return nullptr;
+}
+
+Value *TranslateMatVecMulAdd(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
+                             HLOperationLowerHelper &Helper,
+                             HLObjectOperationLowerHelper *ObjHelper,
+                             bool &Translated) {
+
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Constant *OpArg = HlslOp->GetU32Const(static_cast<unsigned>(OpCode));
+
+  // Input vector parameters
+  Value *InputVector =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputVectorIdx);
+  Value *InputIsUnsigned =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsInputUnsignedIdx);
+  Value *InputInterpretation =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputInterpretationIdx);
+
+  // Matrix parameters
+  Value *MatrixBuffer =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixBufferIdx);
+  Value *MatrixOffset =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixOffsetIdx);
+  Value *MatrixInterpretation =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixInterpretationIdx);
+  Value *MatrixM = CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixMIdx);
+  Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixKIdx);
+  Value *MatrixLayout =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixLayoutIdx);
+  Value *MatrixTranspose =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixTransposeIdx);
+  Value *MatrixStride =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixStrideIdx);
+
+  // Bias parameters
+  Value *BiasBuffer =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasBufferIdx);
+  Value *BiasOffset =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasOffsetIdx);
+  Value *BiasInterpretation =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasInterpretationIdx);
+
+  // Output parameters
+  Value *OutputIsUnsigned =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsOutputUnsignedIdx);
+
+  // Get the DXIL function for the operation
+  Function *DxilFunc = HlslOp->GetOpFunc(
+      OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx)
+                   ->getType()
+                   ->getPointerElementType(),
+               InputVector->getType()});
+
+  // Create a call to the DXIL function
+  Value *NewCI = Builder.CreateCall(
+      DxilFunc, {OpArg, InputVector, InputIsUnsigned, InputInterpretation,
+                 MatrixBuffer, MatrixOffset, MatrixInterpretation, MatrixM,
+                 MatrixK, MatrixLayout, MatrixTranspose, MatrixStride,
+                 BiasBuffer, BiasOffset, BiasInterpretation, OutputIsUnsigned});
+
+  // Store the result in the output parameter
+  Value *OutParam =
+      CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx);
+  Builder.CreateStore(NewCI, OutParam);
+
+  return nullptr;
+}
+
+Value *TranslateOuterProductAccumulate(CallInst *CI, IntrinsicOp IOP,
+                                       OP::OpCode OpCode,
+                                       HLOperationLowerHelper &Helper,
+                                       HLObjectOperationLowerHelper *ObjHelper,
+                                       bool &Translated) {
+
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Constant *OpArg = HlslOp->GetU32Const(static_cast<unsigned>(OpCode));
+
+  // Input vector parameters
+  Value *InputVector1 =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec1Idx);
+  Value *InputVector2 =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec2Idx);
+
+  // Matrix parameters
+  Value *MatrixBuffer =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixIdx);
+  Value *MatrixOffset =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixOffsetIdx);
+  Value *MatrixInterpretation =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixInterpretationIdx);
+  Value *MatrixLayout =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixLayoutIdx);
+  Value *MatrixStride =
+      CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixStrideIdx);
+
+  // Get the DXIL function for the operation
+  Function *DxilFunc = HlslOp->GetOpFunc(
+      OpCode, {InputVector1->getType(), InputVector2->getType()});
+
+  return Builder.CreateCall(
+      DxilFunc, {OpArg, InputVector1, InputVector2, MatrixBuffer, MatrixOffset,
+                 MatrixInterpretation, MatrixLayout, MatrixStride});
+}
+
+Value *TranslateVectorAccumulate(CallInst *CI, IntrinsicOp IOP,
+                                 OP::OpCode OpCode,
+                                 HLOperationLowerHelper &Helper,
+                                 HLObjectOperationLowerHelper *ObjHelper,
+                                 bool &Translated) {
+
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Constant *OpArg = HlslOp->GetU32Const(static_cast<unsigned>(OpCode));
+
+  // Input vector parameter
+  Value *InputVector = CI->getArgOperand(HLOperandIndex::kVectorAccInputVecIdx);
+
+  // Matrix parameters
+  Value *MatrixBuffer = CI->getArgOperand(HLOperandIndex::kVectorAccMatrixIdx);
+  Value *MatrixOffset =
+      CI->getArgOperand(HLOperandIndex::kVectorAccMatrixOffsetIdx);
+
+  // Get the DXIL function for the operation
+  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, InputVector->getType());
+
+  return Builder.CreateCall(DxilFunc,
+                            {OpArg, InputVector, MatrixBuffer, MatrixOffset});
+}
+
 } // namespace
 
 // Lower table.
@@ -7275,6 +7469,15 @@ IntrinsicLower gLowerTable[] = {
      DXIL::OpCode::HitObject_SetShaderTableIndex},
     {IntrinsicOp::MOP_DxHitObject_TraceRay, TranslateHitObjectTraceRay,
      DXIL::OpCode::HitObject_TraceRay},
+
+    {IntrinsicOp::IOP___builtin_MatVecMul, TranslateMatVecMul,
+     DXIL::OpCode::MatVecMul},
+    {IntrinsicOp::IOP___builtin_MatVecMulAdd, TranslateMatVecMulAdd,
+     DXIL::OpCode::MatVecMulAdd},
+    {IntrinsicOp::IOP___builtin_OuterProductAccumulate,
+     TranslateOuterProductAccumulate, DXIL::OpCode::OuterProductAccumulate},
+    {IntrinsicOp::IOP___builtin_VectorAccumulate, TranslateVectorAccumulate,
+     DXIL::OpCode::VectorAccumulate},
 };
 } // namespace
 static_assert(
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 43946bc78a..6e58c0e872 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -12108,6 +12108,18 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
     break;
   case hlsl::IntrinsicOp::IOP_DxMaybeReorderThread:
     DiagnoseReachableSERCall(*this, CE, EntrySK, EntryDecl, true);
+    break;
+  case hlsl::IntrinsicOp::IOP___builtin_MatVecMul:
+  case hlsl::IntrinsicOp::IOP___builtin_MatVecMulAdd:
+  case hlsl::IntrinsicOp::IOP___builtin_OuterProductAccumulate:
+  case hlsl::IntrinsicOp::IOP___builtin_VectorAccumulate:
+    if (!SM->IsSM69Plus()) {
+      Diags.Report(CE->getExprLoc(),
+                   diag::warn_hlsl_intrinsic_in_wrong_shader_model)
+          << FD->getNameAsString() << EntryDecl->getNameAsString() << "6.9";
+      return;
+    }
+
     break;
   default:
     break;
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
new file mode 100644
index 0000000000..74cb51260c
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
@@ -0,0 +1,135 @@
+// RUN: %dxc -T lib_6_9 %s | FileCheck %s
+ 
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer;
+RWByteAddressBuffer rw_matrix_buffer;
+ByteAddressBuffer input_vector_buffer;
+RWByteAddressBuffer output_vector_buffer;
+
+void UseCoopVec() {
+    vector<float, 4> output_vector;
+    static const uint is_output_unsigned = 0;
+
+    vector<float, 4> input_vector = input_vector_buffer.Load<vector<float, 4> >(0);
+    const uint is_input_unsigned = 0;
+    const uint input_interpretation = 9; /*F32*/
+
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = 9; /*F32*/
+    const uint matrix_dimM = 4;
+    const uint matrix_dimK = 4;
+    const uint matrix_layout = 0; /*RowMajor*/
+    const bool matrix_is_transposed = false;
+    const uint matrix_stride = 64;
+
+    __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride);
+    output_vector_buffer.Store(0, output_vector);
+
+    const uint bias_offset = 0;
+    const uint bias_interpretation = 9; /*F32*/
+
+    __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride, bias_buffer, bias_offset,
+      bias_interpretation);
+    output_vector_buffer.Store(1024, output_vector);
+
+    vector<uint, 8> input_vector1;
+    vector<uint, 8> input_vector2;
+    const uint opa_matrix_offset = 0;
+    const uint opa_matrix_interpretation = 5; /*U32*/
+    const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
+    const uint opa_matrix_stride = 64;
+
+    __builtin_OuterProductAccumulate(input_vector1, input_vector2,
+      rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
+      opa_matrix_layout, opa_matrix_stride);
+
+    const uint va_matrix_offset = 0;
+
+     __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
+       va_matrix_offset);
+}
+
+// CHECK: define void @ps_main()
+// CHECK: call <4 x float> @dx.op.matVecMul
+// CHECK: call <4 x float> @dx.op.matVecMulAdd
+// CHECK: call void @dx.op.outerProductAccumulate
+// CHECK: call void @dx.op.vectorAccumulate
+
+[Shader("pixel")]
+void ps_main()
+{
+    UseCoopVec();
+}
+
+// CHECK: define void @cs_main()
+// CHECK: call <4 x float> @dx.op.matVecMul
+// CHECK: call <4 x float> @dx.op.matVecMulAdd
+// CHECK: call void @dx.op.outerProductAccumulate
+// CHECK: call void @dx.op.vectorAccumulate
+
+[Shader("compute")]
+[NumThreads(1,1,1)]
+void cs_main()
+{
+    UseCoopVec();
+}
+
+// CHECK: define void @vs_main()
+// CHECK: call <4 x float> @dx.op.matVecMul
+// CHECK: call <4 x float> @dx.op.matVecMulAdd
+// CHECK: call void @dx.op.outerProductAccumulate
+// CHECK: call void @dx.op.vectorAccumulate
+
+[Shader("vertex")]
+void vs_main()
+{
+    UseCoopVec();
+}
+
+struct MyRecord{
+    uint a;
+};
+
+// CHECK: define void @ns_main()
+// CHECK: call <4 x float> @dx.op.matVecMul
+// CHECK: call <4 x float> @dx.op.matVecMulAdd
+// CHECK: call void @dx.op.outerProductAccumulate
+// CHECK: call void @dx.op.vectorAccumulate
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void ns_main(ThreadNodeInputRecord<MyRecord> input)
+{
+    UseCoopVec();
+}
+
+// Vertex shader output structure
+struct VS_OUT {
+    float3 Color : COLOR0;
+};
+
+// Geometry shader output structure
+struct GS_OUT {
+    float3 Color : COLOR0;
+    float2 TexCoord : TEXCOORD0;
+};
+
+// CHECK: define void @gs_main()
+// CHECK:  call <4 x float> @dx.op.matVecMul
+// CHECK: call <4 x float> @dx.op.matVecMulAdd
+// CHECK: call void @dx.op.outerProductAccumulate
+// CHECK: call void @dx.op.vectorAccumulate
+
+[shader("geometry")]
+[maxvertexcount(3)]
+void gs_main(point VS_OUT input[1], 
+    inout TriangleStream<GS_OUT> OutputStream)
+{
+    UseCoopVec();
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
new file mode 100644
index 0000000000..c3b4a3a8d7
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
@@ -0,0 +1,79 @@
+// RUN: %dxc -fcgl -T cs_6_9 -E cs_main %s | FileCheck %s
+
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer opa_input_buffer;
+ByteAddressBuffer matrix_buffer; // matched by mangled name (\01?matrix_buffer@@3UByteAddressBuffer@@A) in the directives below
+ByteAddressBuffer bias_buffer; // matched by mangled name (\01?bias_buffer@@3UByteAddressBuffer@@A) in the directives below
+RWByteAddressBuffer rw_matrix_buffer; // matched by mangled name (\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A) in the directives below
+RWByteAddressBuffer output_vector_buffer;
+
+[Shader("compute")]
+[NumThreads(1,1,1)]
+void cs_main() // calls each linalg builtin once; the directives below match the -fcgl HL op calls (leading i32 operand 390-393)
+{    
+    vector<float, 4> output_vector; // written by both MatVecMul builtins (the <4 x float>* operand of the HL op)
+    static const uint is_output_unsigned = 0;
+    
+    vector<float, 4> input_vector = input_vector_buffer.Load<vector<float, 4> >(0);
+    const uint is_input_unsigned = 0;
+    const uint input_interpretation = 9; /*F32*/
+    
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = 9; /*F32*/
+    const uint matrix_dimM = 4;
+    const uint matrix_dimK = 4;
+    const uint matrix_layout = 0; /*RowMajor*/
+    const bool matrix_is_transposed = false; 
+    const uint matrix_stride = 64;
+
+    // CHECK: %[[MLD0:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A"
+    // CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD0]])
+    // CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH0]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
+    // CHECK: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64)
+    __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride);
+    output_vector_buffer.Store(0, output_vector); // MatVecMul result stored at offset 0
+
+    const uint bias_offset = 0;
+    const uint bias_interpretation = 9; /*F32*/
+
+    // CHECK: %[[MLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A"
+    // CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD1]])
+    // CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
+    // CHECK-NEXT: %[[BLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A"
+    // CHECK-NEXT: %[[BCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[BLD1]])
+    // CHECK-NEXT: %[[BAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[BCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
+    // CHECK-NEXT: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9)
+    __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride, bias_buffer, bias_offset,
+      bias_interpretation);
+    output_vector_buffer.Store(1024, output_vector); // MatVecMulAdd result stored separately at offset 1024
+
+    vector<uint, 8> input_vector1 = opa_input_buffer.Load<vector<uint, 8> >(0);
+    vector<uint, 8> input_vector2 = opa_input_buffer.Load<vector<uint, 8> >(128); // second operand read from offset 128 of the same buffer
+    const uint opa_matrix_offset = 0;
+    const uint opa_matrix_interpretation = 5; /*U32*/
+    const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
+    const uint opa_matrix_stride = 64;
+
+    // CHECK: %[[MLD2:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
+    // CHECK: %[[MCH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD2]])
+    // CHECK: %[[MAH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH2]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef)
+    // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH2]], i32 0, i32 5, i32 3, i32 64)
+    __builtin_OuterProductAccumulate(input_vector1, input_vector2,
+      rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
+      opa_matrix_layout, opa_matrix_stride);
+
+    const uint va_matrix_offset = 0; // offset operand for the VectorAccumulate call below
+
+    // CHECK: %[[MLD3:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
+    // CHECK: %[[MCH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD3]])
+    // CHECK: %[[MAH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH3]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef)
+    // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH3]], i32 0)
+    __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
+      va_matrix_offset); 
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg
new file mode 100644
index 0000000000..c2417a9e43
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg
@@ -0,0 +1 @@
+config.unsupported = 'dxil-1-9' not in config.available_features  # mark this directory's tests unsupported when the 'dxil-1-9' feature is absent
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
new file mode 100644
index 0000000000..98a568fa22
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
@@ -0,0 +1,108 @@
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-7
+
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
+
+
+// COMMON: define void @main()
+
+// Test minimum support set of combinations for matVecMulAdd
+// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+// DXIL-0: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+// DXIL-1: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+// DXIL-2: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
+// DXIL-3: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
+// DXIL-4: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// Test unsigned variations
+// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+// DXIL-5: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 true)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+// DXIL-6: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+// DXIL-7: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+
+ByteAddressBuffer input_vector_buffer; 
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer;
+RWByteAddressBuffer rw_matrix_buffer; // not referenced by main() in this test
+RWByteAddressBuffer output_vector_buffer;
+
+enum CompType { // numeric codes for the interpretation operands; main() picks them via the -DII / -DMI / -DBI macros
+  Invalid = 0,
+  I1 = 1,
+  I16 = 2,
+  U16 = 3,
+  I32 = 4,
+  U32 = 5,
+  I64 = 6,
+  U64 = 7,
+  F16 = 8,
+  F32 = 9,
+  F64 = 10,
+  SNormF16 = 11,
+  UNormF16 = 12,
+  SNormF32 = 13,
+  UNormF32 = 14,
+  SNormF64 = 15,
+  UNormF64 = 16,
+  PackedS8x32 = 17,
+  PackedU8x32 = 18,
+
+  // BEGIN NEW FOR SM 6.9
+  U8 = 19,
+  I8 = 20,
+  F8_E4M3 = 21,
+  F8_E5M2 = 22,
+};
+
+enum MatLayout { // numeric codes for the matrix layout operand; main() picks one via the -DML macro
+  RowMajor = 0,
+  ColumnMajor = 1,
+  MulOptimal = 2,
+  OuterProductOptimal = 3,
+};
+
+[NumThreads(1,1,1)]
+void main() // issues one __builtin_MatVecMulAdd whose types and operand codes come from the command-line -D macros
+{    
+    vector<OTY, 4> output_vector; // do not rename: the HLOP-n directives match this local literally as %output_vector
+    static const uint is_output_unsigned = OU; // shows up as the i1 after the output pointer (HL op) and as the last i1 (DXIL op)
+    
+    vector<ITY, 8> input_vector = input_vector_buffer.Load<vector<ITY, 8> >(0);
+    const uint is_input_unsigned = IU;
+    const uint input_interpretation = II;
+    
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = MI;
+    const uint matrix_dimM = 8; // the directives above hard-code `i32 8, i32 8` for the M and K operands
+    const uint matrix_dimK = 8;
+    const uint matrix_layout = ML;
+    const bool matrix_is_transposed = (bool) MT; 
+    const uint matrix_stride = 64; // matched as `i32 64` by every directive above
+
+    const uint bias_offset = 0;
+    const uint bias_interpretation = BI;
+
+    __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, 
+        matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+    output_vector_buffer.Store(0, output_vector); // store the result so output_vector has a use after the builtin
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
new file mode 100644
index 0000000000..2ca2648503
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
@@ -0,0 +1,104 @@
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-7
+
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
+
+// COMMON: define void @main()
+
+// Test minimum support set of combinations for matVecMul
+// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
+// DXIL-0: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64)
+// DXIL-1: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64)
+// DXIL-2: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64)
+// DXIL-3: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64)
+// DXIL-4: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// Test unsigned variations
+// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
+// DXIL-5: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 true)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64)
+// DXIL-6: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64)
+// DXIL-7: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+
+ByteAddressBuffer input_vector_buffer; 
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer; // not referenced by main() in this test
+RWByteAddressBuffer rw_matrix_buffer; // not referenced by main() in this test
+RWByteAddressBuffer output_vector_buffer;
+
+enum CompType { // numeric codes for the interpretation operands; main() picks them via the -DII / -DMI macros
+  Invalid = 0,
+  I1 = 1,
+  I16 = 2,
+  U16 = 3,
+  I32 = 4,
+  U32 = 5,
+  I64 = 6,
+  U64 = 7,
+  F16 = 8,
+  F32 = 9,
+  F64 = 10,
+  SNormF16 = 11,
+  UNormF16 = 12,
+  SNormF32 = 13,
+  UNormF32 = 14,
+  SNormF64 = 15,
+  UNormF64 = 16,
+  PackedS8x32 = 17,
+  PackedU8x32 = 18,
+
+  // BEGIN NEW FOR SM 6.9
+  U8 = 19,
+  I8 = 20,
+  F8_E4M3 = 21,
+  F8_E5M2 = 22,
+};
+
+enum MatLayout { // numeric codes for the matrix layout operand; main() picks one via the -DML macro
+  RowMajor = 0,
+  ColumnMajor = 1,
+  MulOptimal = 2,
+  OuterProductOptimal = 3,
+};
+
+[NumThreads(1,1,1)]
+void main() // issues one __builtin_MatVecMul whose types and operand codes come from the command-line -D macros
+{    
+    vector<OTY, 4> output_vector; // do not rename: the HLOP-n directives match this local literally as %output_vector
+    static const uint is_output_unsigned = OU; // shows up as the i1 after the output pointer (HL op) and as the last i1 (DXIL op)
+    
+    vector<ITY, 8> input_vector = input_vector_buffer.Load<vector<ITY, 8> >(0);
+    const uint is_input_unsigned = IU;
+    const uint input_interpretation = II;
+    
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = MI;
+    const uint matrix_dimM = 8; // the directives above hard-code `i32 8, i32 8` for the M and K operands
+    const uint matrix_dimK = 8;
+    const uint matrix_layout = ML;
+    const bool matrix_is_transposed = (bool) MT; 
+    const uint matrix_stride = 64; // matched as `i32 64` by every directive above
+
+    __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, 
+        matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride);
+    output_vector_buffer.Store(0, output_vector); // store the result so output_vector has a use after the builtin
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
new file mode 100644
index 0000000000..40bbe62284
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
@@ -0,0 +1,70 @@
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2
+
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
+
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer input_vector_buffer2;
+RWByteAddressBuffer matrix_buffer;
+
+// COMMON: define void @main()
+// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64)
+// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64)
+// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64)
+
+enum CompType { // Picked by name through -DMI on the RUN lines; the numeric value is the interpretation operand in the expected calls.
+  Invalid = 0,
+  I1 = 1,
+  I16 = 2,
+  U16 = 3,
+  I32 = 4,
+  U32 = 5,
+  I64 = 6,
+  U64 = 7,
+  F16 = 8,
+  F32 = 9,
+  F64 = 10,
+  SNormF16 = 11,
+  UNormF16 = 12,
+  SNormF32 = 13,
+  UNormF32 = 14,
+  SNormF64 = 15,
+  UNormF64 = 16,
+  PackedS8x32 = 17,
+  PackedU8x32 = 18,
+
+  // Entries from here on (19-22) are the ones marked as new for Shader Model 6.9.
+  U8 = 19,
+  I8 = 20,
+  F8_E4M3 = 21,
+  F8_E5M2 = 22,
+};
+
+enum MatLayout { // Picked by name through -DML on the RUN lines; the numeric value is the layout operand in the expected calls.
+  RowMajor = 0,
+  ColumnMajor = 1,
+  MulOptimal = 2,
+  OuterProductOptimal = 3,
+};
+
+
+[Numthreads(1,1,1)]
+void main() // Entry point: performs a single __builtin_OuterProductAccumulate call into matrix_buffer.
+{
+    vector<ITY, 8> input_vector1 = input_vector_buffer.Load<vector<ITY, 8> >(0);
+    vector<ITY, 8> input_vector2 = input_vector_buffer2.Load<vector<ITY, 8> >(0);
+    // ITY, MI and ML are supplied by the -D options on the RUN lines at the top of this file.
+    const uint matrix_interpretation = MI;
+    const uint matrix_layout = ML;
+    const uint matrix_offset = 0;
+    const uint matrix_stride = 64;
+    // The patterns above expect dx.hl.op opcode 392 under -fcgl and dx.op.outerProductAccumulate (307) otherwise.
+    __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride);
+
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl
new file mode 100644
index 0000000000..dc1bb6c563
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -T cs_6_9 %s | FileCheck %s
+
+RWByteAddressBuffer matrix_buffer;
+
+// Test use of __builtin_VectorAccumulate in compute shader
+// CHECK: define void @main()
+// CHECK: call void @dx.op.vectorAccumulate.v2i32(i32 {{[0-9]+}}, <2 x i32> <i32 5, i32 5>, %dx.types.Handle {{%[0-9]+}}, i32 0)
+
+[NumThreads(1,1,1)]
+void main() // Entry point: performs a single __builtin_VectorAccumulate call into matrix_buffer.
+{
+    vector<uint, 2> input_vector1 = 5; // scalar initializer; the expected call carries <i32 5, i32 5>
+    const uint matrix_offset = 0;      // shows up as the trailing i32 0 operand of the expected call
+    // matrix_buffer is the RWByteAddressBuffer declared at the top of this file.
+     __builtin_VectorAccumulate(input_vector1, matrix_buffer, matrix_offset);
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
new file mode 100644
index 0000000000..6623f63031
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
@@ -0,0 +1,189 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+; REQUIRES: dxil-1-9
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.ByteAddressBuffer = type { i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+
+@"\01?input_vector_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
+@"\01?opa_input_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
+@"\01?matrix_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
+@"\01?bias_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
+@"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+@"\01?output_vector_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4
+
+; Function Attrs: nounwind
+define void @cs_main() #0 { ; HL-form input for -dxilgen; contains the four linalg dx.hl.op calls (first operands 390-393)
+entry:
+  ;CHECK-DAG: %[[MLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A"
+  ;CHECK-DAG: %[[BLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A"
+  ;CHECK-DAG: %[[RWMLD0:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
+  %output_vector = alloca <4 x float>, align 4
+  %tmp = bitcast <4 x float>* %output_vector to i8*, !dbg !21 ; line:14 col:5
+  call void @llvm.lifetime.start(i64 16, i8* %tmp) #0, !dbg !21 ; line:14 col:5
+  %tmp1 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !dbg !25 ; line:17 col:37
+  %tmp2 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp1), !dbg !25 ; line:17 col:37
+  %tmp3 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp2, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !25 ; line:17 col:37
+  %tmp4 = call <4 x float> @"dx.hl.op.ro.<4 x float> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp3, i32 0), !dbg !25 ; line:17 col:37
+  %tmp5 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !26 ; line:33 col:5
+  %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp5), !dbg !26 ; line:33 col:5
+  %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !26 ; line:33 col:5
+  ; Step 1: the dx.hl.op call with first operand 390 is expected to become dx.op.matVecMul (305) on the matrix_buffer handle.
+  ;CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]]
+  ;CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH0]]
+  ;CHECK: call <4 x float> @dx.op.matVecMul.v4f32.v4f32(i32 305, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, i1 false) 
+  call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp7, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64), !dbg !26 ; line:33 col:5
+  ; The result is reloaded from %output_vector and stored to output_vector_buffer at offset 0.
+  %tmp8 = load <4 x float>, <4 x float>* %output_vector, align 4, !dbg !27, !tbaa !28 ; line:37 col:35
+  %tmp9 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !31 ; line:37 col:5
+  %tmp10 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp9), !dbg !31 ; line:37 col:5
+  %tmp11 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp10, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !31 ; line:37 col:5
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp11, i32 0, <4 x float> %tmp8), !dbg !31 ; line:37 col:5
+  %tmp12 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5
+  %tmp13 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp12), !dbg !32 ; line:49 col:5
+  %tmp14 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp13, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5
+  %tmp15 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5
+  %tmp16 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp15), !dbg !32 ; line:49 col:5
+  %tmp17 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp16, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5
+  ; Step 2: the dx.hl.op call with first operand 391 is expected to become dx.op.matVecMulAdd (306), with a second handle for bias_buffer.
+  ;CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]]
+  ;CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH1]]
+  ;CHECK: %[[BCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[BLD]]
+  ;CHECK: %[[BAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[BCH1]]
+  ;CHECK: call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9, i1 false)
+  call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp14, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %tmp17, i32 0, i32 9), !dbg !32 ; line:49 col:5
+  ; The second result is stored to output_vector_buffer at offset 1024; two <8 x i32> vectors are then loaded from opa_input_buffer.
+  %tmp18 = load <4 x float>, <4 x float>* %output_vector, align 4, !dbg !33, !tbaa !28 ; line:54 col:38
+  %tmp19 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !34 ; line:54 col:5
+  %tmp20 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp19), !dbg !34 ; line:54 col:5
+  %tmp21 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp20, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !34 ; line:54 col:5
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp21, i32 1024, <4 x float> %tmp18), !dbg !34 ; line:54 col:5
+  %tmp22 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !35 ; line:56 col:37
+  %tmp23 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp22), !dbg !35 ; line:56 col:37
+  %tmp24 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp23, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !35 ; line:56 col:37
+  %tmp25 = call <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp24, i32 0), !dbg !35 ; line:56 col:37
+  %tmp26 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !36 ; line:57 col:37
+  %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp26), !dbg !36 ; line:57 col:37
+  %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !36 ; line:57 col:37
+  %tmp29 = call <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp28, i32 128), !dbg !36 ; line:57 col:37
+  %tmp30 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !37 ; line:67 col:5
+  %tmp31 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp30), !dbg !37 ; line:67 col:5
+  %tmp32 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp31, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !37 ; line:67 col:5
+  ; Step 3: the dx.hl.op call with first operand 392 is expected to become dx.op.outerProductAccumulate (307) on rw_matrix_buffer.
+  ;CHECK: %[[RWMCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]]
+  ;CHECK: %[[RWMAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH0]]
+  ;CHECK: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH0]], i32 0, i32 5, i32 3, i32 64)
+  call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %tmp25, <8 x i32> %tmp29, %dx.types.Handle %tmp32, i32 0, i32 5, i32 3, i32 64), !dbg !37 ; line:67 col:5
+
+  ; Step 4: the dx.hl.op call with first operand 393 is expected to become dx.op.vectorAccumulate (308); it reuses %tmp25 as its input vector.
+  %tmp33 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !38 ; line:77 col:5
+  %tmp34 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp33), !dbg !38 ; line:77 col:5
+  %tmp35 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp34, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !38 ; line:77 col:5
+
+  ;CHECK: %[[RWMCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]]
+  ;CHECK: %[[RWMAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH1]]
+  ;CHECK: call void @dx.op.vectorAccumulate.v8i32(i32 308, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH1]], i32 0)
+  call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %tmp25, %dx.types.Handle %tmp35, i32 0), !dbg !38 ; line:77 col:5
+  ; Close the lifetime of %output_vector that was opened at the top of the function.
+  %tmp36 = bitcast <4 x float>* %output_vector to i8*, !dbg !39 ; line:79 col:1
+  call void @llvm.lifetime.end(i64 16, i8* %tmp36) #0, !dbg !39 ; line:79 col:1
+  ret void, !dbg !39 ; line:79 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @"dx.hl.op.ro.<4 x float> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32, %struct.ByteAddressBuffer) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer) #2
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32, %dx.types.Handle, i32, <4 x float>) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #2
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32) #0
+
+; Function Attrs: nounwind readonly
+declare <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32, <8 x i32>, %dx.types.Handle, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!dx.version = !{!2}
+!dx.valver = !{!2}
+!dx.shaderModel = !{!3}
+!dx.typeAnnotations = !{!4}
+!dx.entryPoints = !{!8}
+!dx.fnprops = !{!18}
+!dx.options = !{!19, !20}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{i32 1, i32 9}
+!3 = !{!"cs", i32 6, i32 9}
+!4 = !{i32 1, void ()* @cs_main, !5}
+!5 = !{!6}
+!6 = !{i32 1, !7, !7}
+!7 = !{}
+!8 = !{void ()* @cs_main, !"cs_main", null, !9, null}
+!9 = !{!10, !15, null, null}
+!10 = !{!11, !12, !13, !14}
+!11 = !{i32 0, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !"input_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null}
+!12 = !{i32 1, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !"opa_input_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null}
+!13 = !{i32 2, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !"matrix_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null}
+!14 = !{i32 3, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !"bias_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null}
+!15 = !{!16, !17}
+!16 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !"rw_matrix_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!17 = !{i32 1, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !"output_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!18 = !{void ()* @cs_main, i32 5, i32 1, i32 1, i32 1}
+!19 = !{i32 -2147483584}
+!20 = !{i32 -1}
+!21 = !DILocation(line: 14, column: 5, scope: !22)
+!22 = !DISubprogram(name: "cs_main", scope: !23, file: !23, line: 12, type: !24, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @cs_main)
+!23 = !DIFile(filename: "DirectXShaderCompiler\5Ctools\5Cclang\5Ctest\5CCodeGenDXIL\5Chlsl\5Cintrinsics\5Clinalg_builtins\5Clinalg-builtins.hlsl", directory: "")
+!24 = !DISubroutineType(types: !7)
+!25 = !DILocation(line: 17, column: 37, scope: !22)
+!26 = !DILocation(line: 33, column: 5, scope: !22)
+!27 = !DILocation(line: 37, column: 35, scope: !22)
+!28 = !{!29, !29, i64 0}
+!29 = !{!"omnipotent char", !30, i64 0}
+!30 = !{!"Simple C/C++ TBAA"}
+!31 = !DILocation(line: 37, column: 5, scope: !22)
+!32 = !DILocation(line: 49, column: 5, scope: !22)
+!33 = !DILocation(line: 54, column: 38, scope: !22)
+!34 = !DILocation(line: 54, column: 5, scope: !22)
+!35 = !DILocation(line: 56, column: 37, scope: !22)
+!36 = !DILocation(line: 57, column: 37, scope: !22)
+!37 = !DILocation(line: 67, column: 5, scope: !22)
+!38 = !DILocation(line: 77, column: 5, scope: !22)
+!39 = !DILocation(line: 79, column: 1, scope: !22)
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl
new file mode 100644
index 0000000000..d5e251ae8b
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl
@@ -0,0 +1,59 @@
+// RUN: %dxc -T lib_6_8 %s -verify
+ 
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer;
+RWByteAddressBuffer rw_matrix_buffer;
+
+[Shader("compute")]
+[Numthreads(1,1,1)]
+void cs_main()
+{    
+    vector<float, 4> output_vector;
+    static const uint is_output_unsigned = 0;
+    // This file is compiled for lib_6_8 in verify mode: every linalg builtin call below must be diagnosed as needing SM 6.9.
+    vector<float, 4> input_vector;
+    const uint is_input_unsigned = 0;
+    const uint input_interpretation = 9; /*F32*/
+    // Matrix parameters shared by the __builtin_MatVecMul and __builtin_MatVecMulAdd calls.
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = 9; /*F32*/
+    const uint matrix_dimM = 4;
+    const uint matrix_dimK = 4;
+    const uint matrix_layout = 0; /*RowMajor*/
+    const bool matrix_is_transposed = false; 
+    const uint matrix_stride = 64;
+
+    //expected-error@+1{{intrinsic __builtin_MatVecMul potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, 
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride); 
+
+    const uint bias_offset = 0;
+    const uint bias_interpretation = 9; /*F32*/
+
+    //expected-error@+1{{intrinsic __builtin_MatVecMulAdd potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+      matrix_is_transposed, matrix_stride, bias_buffer, bias_offset,
+      bias_interpretation); 
+    // Operands for the two accumulate builtins, both of which take rw_matrix_buffer.
+    vector<uint, 4> input_vector1;
+    vector<uint, 4> input_vector2;
+    const uint opa_matrix_offset = 0;
+    const uint opa_matrix_interpretation = 5; /*U32*/
+    const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
+    const uint opa_matrix_stride = 64;
+
+    //expected-error@+1{{intrinsic __builtin_OuterProductAccumulate potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    __builtin_OuterProductAccumulate(input_vector1, input_vector2,
+      rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
+      opa_matrix_layout, opa_matrix_stride);
+
+    const uint va_matrix_offset = 0;
+
+     //expected-error@+1{{intrinsic __builtin_VectorAccumulate potentially used by 'cs_main' requires shader model 6.9 or greater}}
+     __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
+       va_matrix_offset);
+}
\ No newline at end of file
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index f1274fd308..c394611302 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -383,6 +383,14 @@ void [[]] Barrier(in NodeRecordOrUAV o, in uint SemanticFlags);
 
 uint [[]] GetRemainingRecursionLevels();
 
+void [[]] __builtin_MatVecMul(out numeric<c> OutputVector, in bool OutputIsUnsigned, in numeric<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride);
+
+void [[]] __builtin_MatVecMulAdd(out numeric<c> OutputVector, in bool OutputIsUnsigned, in numeric<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride, in ByteAddressBuffer BiasVector, in uint BiasOffset, in uint BiasInterpretation);
+
+void [[]] __builtin_OuterProductAccumulate(in numeric<c> InputVector1, in numeric<c2> InputVector2, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint MatrixLayout, in uint MatrixStride);
+
+void [[]] __builtin_VectorAccumulate(in numeric<c> InputVector, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset);
+
 } namespace
 
 
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index b1460de9b8..57f2574005 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -873,6 +873,11 @@ def populate_categories_and_models(self):
                 "library",
                 "raygeneration",
             )
+        for i in (
+            "MatVecMul,MatVecMulAdd,OuterProductAccumulate,VectorAccumulate"
+        ).split(","):
+            self.name_idx[i].category = "Linear Algebra Operations"
+            self.name_idx[i].shader_model = 6, 9
 
     def populate_llvm_instructions(self):
         # Add instructions that map to LLVM instructions.
@@ -6340,6 +6345,103 @@ def UFI(name, **mappings):
         )
         next_op_idx += 1
 
+        self.add_dxil_op(
+            "MatVecMul",
+            next_op_idx,
+            "MatVecMul",
+            "Multiplies a MxK dimension matrix and a K sized input vector",
+            "<hfwi,<hfwi",
+            "ro",
+            [
+                db_dxil_param(0, "$x0", "outputVector", "output vector"),
+                db_dxil_param(2, "$x1", "inputVector", "input vector"),
+                db_dxil_param(3, "i1", "isInputUnsigned", "is input unsigned"),
+                db_dxil_param(4, "i32", "inputInterpretation", "input interpretation"),
+                db_dxil_param(5, "res", "matrixBuffer", "matrix resource"),
+                db_dxil_param(6, "i32", "matrixOffset", "matrix offset"),
+                db_dxil_param(7, "i32", "matrixIntepretation", "matrix intepretation"),
+                db_dxil_param(8, "i32", "matrixM", "matrix M dimension"),
+                db_dxil_param(9, "i32", "matrixK", "matrix K dimension"),
+                db_dxil_param(10, "i32", "matrixLayout", "matrix layout"),
+                db_dxil_param(11, "i1", "matrixTranspose", "matrix transpose"),
+                db_dxil_param(12, "i32", "matrixStride", "matrix stride"),
+                db_dxil_param(13, "i1", "isOutputUnsigned", "is output unsigned"),
+            ],
+        )
+        next_op_idx += 1
+
+        self.add_dxil_op(
+            "MatVecMulAdd",
+            next_op_idx,
+            "MatVecMulAdd",
+            "multiplies a MxK dimension matrix and a K sized input vector and adds an M-sized bias vector",
+            "<hfwi,<hfwi",
+            "ro",
+            [
+                db_dxil_param(0, "$x0", "outputVector", "output vector"),
+                db_dxil_param(2, "$x1", "inputVector", "input vector"),
+                db_dxil_param(3, "i1", "isInputUnsigned", "is input unsigned"),
+                db_dxil_param(4, "i32", "inputInterpretation", "input interpretation"),
+                db_dxil_param(5, "res", "matrixBuffer", "matrix resource"),
+                db_dxil_param(6, "i32", "matrixOffset", "matrix offset"),
+                db_dxil_param(7, "i32", "matrixIntepretation", "matrix intepretation"),
+                db_dxil_param(8, "i32", "matrixM", "matrix M dimension"),
+                db_dxil_param(9, "i32", "matrixK", "matrix K dimension"),
+                db_dxil_param(10, "i32", "matrixLayout", "matrix layout"),
+                db_dxil_param(11, "i1", "matrixTranspose", "matrix transpose"),
+                db_dxil_param(12, "i32", "matrixStride", "matrix stride"),
+                db_dxil_param(13, "res", "biasBuffer", "bias vector resource"),
+                db_dxil_param(14, "i32", "biasOffset", "bias vector offset"),
+                db_dxil_param(
+                    15, "i32", "biasIntepretation", "bias vector intepretation"
+                ),
+                db_dxil_param(16, "i1", "isOutputUnsigned", "is output unsigned"),
+            ],
+        )
+        next_op_idx += 1
+
+        self.add_dxil_op(
+            "OuterProductAccumulate",
+            next_op_idx,
+            "OuterProductAccumulate",
+            "Computes the outer product between column vectors and an MxN matrix is accumulated component-wise atomically (with device scope) in memory",
+            "<hfwi,<hfwi",
+            "",
+            [
+                db_dxil_param(0, "v", "", ""),
+                db_dxil_param(2, "$x0", "inputVector1", "input vector 1"),
+                db_dxil_param(3, "$x1", "inputVector2", "input vector 2"),
+                db_dxil_param(4, "res", "matrixBuffer", "matrix resource"),
+                db_dxil_param(5, "i32", "matrixOffset", "matrix offset"),
+                db_dxil_param(
+                    6,
+                    "i32",
+                    "matrixIntepretation",
+                    "matrix intepretation",
+                    is_const=True,
+                ),
+                db_dxil_param(7, "i32", "matrixLayout", "matrix layout", is_const=True),
+                db_dxil_param(8, "i32", "matrixStride", "matrix stride"),
+            ],
+        )
+        next_op_idx += 1
+
+        self.add_dxil_op(
+            "VectorAccumulate",
+            next_op_idx,
+            "VectorAccumulate",
+            "Accumulates the components of a vector component-wise atomically (with device scope) to the corresponding elements of an array in memory",
+            "<hfwi",
+            "",
+            [
+                db_dxil_param(0, "v", "", ""),
+                db_dxil_param(2, "$o", "inputVector", "input vector 1"),
+                db_dxil_param(3, "res", "arrayBuffer", "output array resource"),
+                db_dxil_param(4, "i32", "arrayOffset", "output array offset"),
+            ],
+        )
+        next_op_idx += 1
+
         # End of DXIL 1.9 opcodes.
         # NOTE!! Update and uncomment when DXIL 1.9 opcodes are finalized:
         # self.set_op_count_for_version(1, 9, next_op_idx)
@@ -8308,6 +8410,55 @@ def build_valrules(self):
             "Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.",
         )
 
+        # Linalg ops
+        self.add_valrule_msg(
+            "Instr.MatVecOpIsUnsignedFlagsAreConst",
+            "In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant.",
+            "'%1' is not a constant value",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgInterpretationParamAreConst",
+            "In Linalg operations, Interpretation value is a constant.",
+            "'%1' is not a constant value",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgInvalidRegisterInterpValue",
+            "From Register Interpretation value must be valid.",
+            "'%0' is not a valid %1 interpretation value",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgInvalidMemoryInterpValue",
+            "In Memory Interpolation value must be valid.",
+            "'%0' is not a valid %1 interpretation value",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgMatrixShapeParamsAreConst",
+            "Matrix Layout, Dimensions and isTranspose are constants",
+            "'%0' is not a constant value",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgInvalidMatrixLayoutValueForMatVecOps",
+            "Matrix Layout for Linalg Mul/MulAdd operation must be valid.",
+            "matrix layout value '%0' is not valid. Must be between [%1 - %2]",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgMatrixLayoutNotTransposable",
+            "Row Major and Column Major matrix layouts are not transposable.",
+            "%0 matrix layout is not transposable",
+        )
+
+        self.add_valrule_msg(
+            "Instr.LinalgNotAnUnsignedType",
+            "Unsigned flag set for a float signed type",
+            "IsUnsigned flag set to true for a float type '%0' vector",
+        )
+
         # Some legacy rules:
         # - space is only supported for shader targets 5.1 and higher
         # - multiple rules regarding derivatives, which isn't a supported feature for DXIL
diff --git a/utils/hct/hlsl_intrinsic_opcodes.json b/utils/hct/hlsl_intrinsic_opcodes.json
index d99b84b745..a6f17bf7cf 100644
--- a/utils/hct/hlsl_intrinsic_opcodes.json
+++ b/utils/hct/hlsl_intrinsic_opcodes.json
@@ -1,6 +1,6 @@
 {
   "IntrinsicOpCodes": {
-    "Num_Intrinsics": 390,
+    "Num_Intrinsics": 394,
     "IOP_AcceptHitAndEndSearch": 0,
     "IOP_AddUint64": 1,
     "IOP_AllMemoryBarrier": 2,
@@ -390,6 +390,10 @@
     "MOP_DxHitObject_LoadLocalRootTableConstant": 386,
     "MOP_DxHitObject_MakeMiss": 387,
     "MOP_DxHitObject_SetShaderTableIndex": 388,
-    "MOP_DxHitObject_TraceRay": 389
+    "MOP_DxHitObject_TraceRay": 389,
+    "IOP___builtin_MatVecMul": 390,
+    "IOP___builtin_MatVecMulAdd": 391,
+    "IOP___builtin_OuterProductAccumulate": 392,
+    "IOP___builtin_VectorAccumulate": 393
   }
 }

From e866b4bac624bdd244ce7ccdcb5570b2510781db Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Tue, 29 Apr 2025 09:35:06 -0400
Subject: [PATCH 14/93] [SPIRV] Refactor OpExecutionModeId (#7378)

The current implementation of OpExecutionModeId assumes that the
operands must be OpConstants. However, they could be the id of a
spec constant. The first step in allowing OpExecutionModeId is to modify
the internal representation of the instruction to hold general spirv
instructions.

Part of https://github.com/microsoft/DirectXShaderCompiler/issues/5960
and https://github.com/microsoft/DirectXShaderCompiler/issues/3092.
---
 .../clang/include/clang/SPIRV/SpirvBuilder.h  | 48 +++++++++++++---
 .../include/clang/SPIRV/SpirvInstruction.h    | 55 ++++++++++++++++---
 tools/clang/include/clang/SPIRV/SpirvModule.h |  8 +--
 .../clang/include/clang/SPIRV/SpirvVisitor.h  |  2 +-
 tools/clang/lib/SPIRV/CapabilityVisitor.cpp   |  2 +-
 tools/clang/lib/SPIRV/CapabilityVisitor.h     |  2 +-
 tools/clang/lib/SPIRV/EmitVisitor.cpp         | 19 ++++---
 tools/clang/lib/SPIRV/EmitVisitor.h           |  2 +-
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 52 ++++++++++++++----
 tools/clang/lib/SPIRV/SpirvEmitter.h          |  5 +-
 tools/clang/lib/SPIRV/SpirvInstruction.cpp    | 19 ++++---
 tools/clang/lib/SPIRV/SpirvModule.cpp         |  9 +--
 .../spv.intrinsicExecutionModeId.hlsl         |  6 +-
 13 files changed, 171 insertions(+), 58 deletions(-)

diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
index 5e03d1ef96..e4e6ef308f 100644
--- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h
+++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
@@ -615,8 +615,15 @@ class SpirvBuilder {
   inline SpirvInstruction *addExecutionMode(SpirvFunction *entryPoint,
                                             spv::ExecutionMode em,
                                             llvm::ArrayRef<uint32_t> params,
-                                            SourceLocation,
-                                            bool useIdParams = false);
+                                            SourceLocation);
+
+  /// \brief Adds an execution mode to the module under construction if it does
+  /// not already exist. Return the newly added instruction or the existing
+  /// instruction, if one already exists.
+  inline SpirvInstruction *
+  addExecutionModeId(SpirvFunction *entryPoint, spv::ExecutionMode em,
+                     llvm::ArrayRef<SpirvInstruction *> params,
+                     SourceLocation loc);
 
   /// \brief Adds an OpModuleProcessed instruction to the module under
   /// construction.
@@ -963,17 +970,44 @@ SpirvBuilder::setDebugSource(uint32_t major, uint32_t minor,
 SpirvInstruction *
 SpirvBuilder::addExecutionMode(SpirvFunction *entryPoint, spv::ExecutionMode em,
                                llvm::ArrayRef<uint32_t> params,
-                               SourceLocation loc, bool useIdParams) {
+                               SourceLocation loc) {
   SpirvExecutionMode *mode = nullptr;
-  SpirvExecutionMode *existingInstruction =
+  SpirvExecutionModeBase *existingInstruction =
       mod->findExecutionMode(entryPoint, em);
 
   if (!existingInstruction) {
-    mode = new (context)
-        SpirvExecutionMode(loc, entryPoint, em, params, useIdParams);
+    mode = new (context) SpirvExecutionMode(loc, entryPoint, em, params);
+    mod->addExecutionMode(mode);
+  } else {
+    // No execution mode can be used with both OpExecutionMode and
+    // OpExecutionModeId. If this assert is triggered, then either
+    // `addExecutionModeId` should have been called for `em`, or the existing
+    // instruction is wrong.
+    assert(existingInstruction->getKind() ==
+           SpirvInstruction::IK_ExecutionMode);
+    mode = cast<SpirvExecutionMode>(existingInstruction);
+  }
+
+  return mode;
+}
+
+SpirvInstruction *SpirvBuilder::addExecutionModeId(
+    SpirvFunction *entryPoint, spv::ExecutionMode em,
+    llvm::ArrayRef<SpirvInstruction *> params, SourceLocation loc) {
+  SpirvExecutionModeId *mode = nullptr;
+  SpirvExecutionModeBase *existingInstruction =
+      mod->findExecutionMode(entryPoint, em);
+  if (!existingInstruction) {
+    mode = new (context) SpirvExecutionModeId(loc, entryPoint, em, params);
     mod->addExecutionMode(mode);
   } else {
-    mode = existingInstruction;
+    // No execution mode can be used with both OpExecutionMode and
+    // OpExecutionModeId. If this assert is triggered, then either
+    // `addExecutionMode` should have been called for `em`, or the existing
+    // instruction is wrong.
+    assert(existingInstruction->getKind() ==
+           SpirvInstruction::IK_ExecutionModeId);
+    mode = cast<SpirvExecutionModeId>(existingInstruction);
   }
 
   return mode;
diff --git a/tools/clang/include/clang/SPIRV/SpirvInstruction.h b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
index f49a295610..6d95459373 100644
--- a/tools/clang/include/clang/SPIRV/SpirvInstruction.h
+++ b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
@@ -57,6 +57,7 @@ class SpirvInstruction {
     IK_MemoryModel,     // OpMemoryModel
     IK_EntryPoint,      // OpEntryPoint
     IK_ExecutionMode,   // OpExecutionMode
+    IK_ExecutionModeId, // OpExecutionModeId
     IK_String,          // OpString (debug)
     IK_Source,          // OpSource (debug)
     IK_ModuleProcessed, // OpModuleProcessed (debug)
@@ -404,12 +405,34 @@ class SpirvEntryPoint : public SpirvInstruction {
   llvm::SmallVector<SpirvVariable *, 8> interfaceVec;
 };
 
+/// \brief Common base of the OpExecutionMode/OpExecutionModeId instructions.
+class SpirvExecutionModeBase : public SpirvInstruction {
+public:
+  SpirvExecutionModeBase(Kind kind, spv::Op opcode, SourceLocation loc,
+                         SpirvFunction *entryPointFunction,
+                         spv::ExecutionMode executionMode)
+      : SpirvInstruction(kind, opcode, QualType(), loc),
+        entryPoint(entryPointFunction), execMode(executionMode) {}
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionModeBase)
+  // LLVM-style RTTI: true for both concrete kinds so isa<>/dyn_cast<> work.
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_ExecutionMode ||
+           inst->getKind() == IK_ExecutionModeId;
+  }
+  bool invokeVisitor(Visitor *v) override;
+  SpirvFunction *getEntryPoint() const { return entryPoint; }
+  spv::ExecutionMode getExecutionMode() const { return execMode; }
+
+private:
+  SpirvFunction *entryPoint;
+  spv::ExecutionMode execMode;
+};
+
 /// \brief OpExecutionMode and OpExecutionModeId instructions
-class SpirvExecutionMode : public SpirvInstruction {
+class SpirvExecutionMode : public SpirvExecutionModeBase {
 public:
   SpirvExecutionMode(SourceLocation loc, SpirvFunction *entryPointFunction,
-                     spv::ExecutionMode, llvm::ArrayRef<uint32_t> params,
-                     bool usesIdParams);
+                     spv::ExecutionMode, llvm::ArrayRef<uint32_t> params);
 
   DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionMode)
 
@@ -420,16 +443,34 @@ class SpirvExecutionMode : public SpirvInstruction {
 
   bool invokeVisitor(Visitor *v) override;
 
-  SpirvFunction *getEntryPoint() const { return entryPoint; }
-  spv::ExecutionMode getExecutionMode() const { return execMode; }
   llvm::ArrayRef<uint32_t> getParams() const { return params; }
 
 private:
-  SpirvFunction *entryPoint;
-  spv::ExecutionMode execMode;
   llvm::SmallVector<uint32_t, 4> params;
 };
 
+/// \brief OpExecutionModeId: an execution mode whose operands are <id>s.
+class SpirvExecutionModeId : public SpirvExecutionModeBase {
+public:
+  SpirvExecutionModeId(SourceLocation loc, SpirvFunction *entryPointFunction,
+                       spv::ExecutionMode em,
+                       llvm::ArrayRef<SpirvInstruction *> params);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionModeId)
+
+  // LLVM-style RTTI hook: matches only IK_ExecutionModeId instructions.
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_ExecutionModeId;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  llvm::ArrayRef<SpirvInstruction *> getParams() const { return params; }
+
+private:
+  llvm::SmallVector<SpirvInstruction *, 4> params;
+};
+
 /// \brief OpString instruction
 class SpirvString : public SpirvInstruction {
 public:
diff --git a/tools/clang/include/clang/SPIRV/SpirvModule.h b/tools/clang/include/clang/SPIRV/SpirvModule.h
index 298c06d65e..9ab0c296b8 100644
--- a/tools/clang/include/clang/SPIRV/SpirvModule.h
+++ b/tools/clang/include/clang/SPIRV/SpirvModule.h
@@ -119,11 +119,11 @@ class SpirvModule {
 
   // Returns an existing execution mode instruction that is the same as em if it
   // exists. Return nullptr otherwise.
-  SpirvExecutionMode *findExecutionMode(SpirvFunction *entryPoint,
-                                        spv::ExecutionMode em);
+  SpirvExecutionModeBase *findExecutionMode(SpirvFunction *entryPoint,
+                                            spv::ExecutionMode em);
 
   // Adds an execution mode to the module.
-  void addExecutionMode(SpirvExecutionMode *);
+  void addExecutionMode(SpirvExecutionModeBase *em);
 
   // Adds an extension to the module. Returns true if the extension was added.
   // Returns false otherwise (e.g. if the extension already existed).
@@ -194,7 +194,7 @@ class SpirvModule {
   llvm::SmallVector<SpirvExtInstImport *, 1> extInstSets;
   SpirvMemoryModel *memoryModel;
   llvm::SmallVector<SpirvEntryPoint *, 1> entryPoints;
-  llvm::SmallVector<SpirvExecutionMode *, 4> executionModes;
+  llvm::SmallVector<SpirvExecutionModeBase *, 4> executionModes;
   llvm::SmallVector<SpirvString *, 4> constStrings;
   std::vector<SpirvSource *> sources;
   std::vector<SpirvModuleProcessed *> moduleProcesses;
diff --git a/tools/clang/include/clang/SPIRV/SpirvVisitor.h b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
index 93682518a1..95bc46aa5f 100644
--- a/tools/clang/include/clang/SPIRV/SpirvVisitor.h
+++ b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
@@ -64,7 +64,7 @@ class Visitor {
   DEFINE_VISIT_METHOD(SpirvExtInstImport)
   DEFINE_VISIT_METHOD(SpirvMemoryModel)
   DEFINE_VISIT_METHOD(SpirvEntryPoint)
-  DEFINE_VISIT_METHOD(SpirvExecutionMode)
+  DEFINE_VISIT_METHOD(SpirvExecutionModeBase)
   DEFINE_VISIT_METHOD(SpirvString)
   DEFINE_VISIT_METHOD(SpirvSource)
   DEFINE_VISIT_METHOD(SpirvModuleProcessed)
diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
index 24dfdc2e9a..9ca9cbc6cd 100644
--- a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
+++ b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
@@ -639,7 +639,7 @@ bool CapabilityVisitor::visit(SpirvEntryPoint *entryPoint) {
   return true;
 }
 
-bool CapabilityVisitor::visit(SpirvExecutionMode *execMode) {
+bool CapabilityVisitor::visit(SpirvExecutionModeBase *execMode) {
   spv::ExecutionMode executionMode = execMode->getExecutionMode();
   SourceLocation execModeSourceLocation = execMode->getSourceLocation();
   SourceLocation entryPointSourceLocation =
diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.h b/tools/clang/lib/SPIRV/CapabilityVisitor.h
index 95db110cce..35d4b5a18b 100644
--- a/tools/clang/lib/SPIRV/CapabilityVisitor.h
+++ b/tools/clang/lib/SPIRV/CapabilityVisitor.h
@@ -31,7 +31,7 @@ class CapabilityVisitor : public Visitor {
 
   bool visit(SpirvDecoration *decor) override;
   bool visit(SpirvEntryPoint *) override;
-  bool visit(SpirvExecutionMode *) override;
+  bool visit(SpirvExecutionModeBase *execMode) override;
   bool visit(SpirvImageQuery *) override;
   bool visit(SpirvImageOp *) override;
   bool visit(SpirvImageSparseTexelsResident *) override;
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp
index eb00f59632..2a3ffd82f4 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.cpp
+++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp
@@ -617,19 +617,20 @@ bool EmitVisitor::visit(SpirvEntryPoint *inst) {
   return true;
 }
 
-bool EmitVisitor::visit(SpirvExecutionMode *inst) {
+bool EmitVisitor::visit(SpirvExecutionModeBase *inst) {
   initInstruction(inst);
   curInst.push_back(getOrAssignResultId<SpirvFunction>(inst->getEntryPoint()));
   curInst.push_back(static_cast<uint32_t>(inst->getExecutionMode()));
-  if (inst->getopcode() == spv::Op::OpExecutionMode) {
-    curInst.insert(curInst.end(), inst->getParams().begin(),
-                   inst->getParams().end());
-  } else {
-    for (uint32_t param : inst->getParams()) {
-      curInst.push_back(typeHandler.getOrCreateConstantInt(
-          llvm::APInt(32, param), context.getUIntType(32),
-          /*isSpecConst */ false));
+  if (auto *exeModeId = dyn_cast<SpirvExecutionModeId>(inst)) {
+    for (SpirvInstruction *param : exeModeId->getParams()) {
+      if (auto *ConstantInst = dyn_cast<SpirvConstant>(param))
+        typeHandler.getOrCreateConstant(ConstantInst);
+      curInst.push_back(getOrAssignResultId<SpirvInstruction>(param));
     }
+  } else {
+    auto *exeMode = llvm::cast<SpirvExecutionMode>(inst);
+    ArrayRef<uint32_t> params = exeMode->getParams();
+    curInst.insert(curInst.end(), params.begin(), params.end());
   }
   finalizeInstruction(&preambleBinary);
   return true;
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.h b/tools/clang/lib/SPIRV/EmitVisitor.h
index 1f9b0939e6..bfa0710998 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.h
+++ b/tools/clang/lib/SPIRV/EmitVisitor.h
@@ -233,7 +233,7 @@ class EmitVisitor : public Visitor {
   bool visit(SpirvEmitVertex *) override;
   bool visit(SpirvEndPrimitive *) override;
   bool visit(SpirvEntryPoint *) override;
-  bool visit(SpirvExecutionMode *) override;
+  bool visit(SpirvExecutionModeBase *) override;
   bool visit(SpirvString *) override;
   bool visit(SpirvSource *) override;
   bool visit(SpirvModuleProcessed *) override;
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index cd5f860555..7337a33b01 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -1146,8 +1146,9 @@ void SpirvEmitter::doStmt(const Stmt *stmt,
     // All cases for expressions used as statements
     SpirvInstruction *result = doExpr(expr);
 
-    if (result && result->getKind() == SpirvInstruction::IK_ExecutionMode &&
-        !attrs.empty()) {
+    if (result && !attrs.empty() &&
+        (result->getKind() == SpirvInstruction::IK_ExecutionMode ||
+         result->getKind() == SpirvInstruction::IK_ExecutionModeId)) {
       // Handle [[vk::ext_capability(..)]] and [[vk::ext_extension(..)]]
       // attributes for vk::ext_execution_mode[_id](..).
       createSpirvIntrInstExt(
@@ -9161,10 +9162,10 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
     retVal = processRawBufferStore(callExpr);
     break;
   case hlsl::IntrinsicOp::IOP_Vkext_execution_mode:
-    retVal = processIntrinsicExecutionMode(callExpr, false);
+    retVal = processIntrinsicExecutionMode(callExpr);
     break;
   case hlsl::IntrinsicOp::IOP_Vkext_execution_mode_id:
-    retVal = processIntrinsicExecutionMode(callExpr, true);
+    retVal = processIntrinsicExecutionModeId(callExpr);
     break;
   case hlsl::IntrinsicOp::IOP_saturate:
     retVal = processIntrinsicSaturate(callExpr);
@@ -15120,8 +15121,7 @@ SpirvEmitter::processCooperativeMatrixGetLength(const CallExpr *call) {
 }
 
 SpirvInstruction *
-SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr,
-                                            bool useIdParams) {
+SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr) {
   llvm::SmallVector<uint32_t, 2> execModesParams;
   uint32_t exeMode = 0;
   const auto args = expr->getArgs();
@@ -15145,9 +15145,38 @@ SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr,
   assert(entryFunction != nullptr);
   assert(exeMode != 0);
 
-  return spvBuilder.addExecutionMode(
-      entryFunction, static_cast<spv::ExecutionMode>(exeMode), execModesParams,
-      expr->getExprLoc(), useIdParams);
+  return spvBuilder.addExecutionMode(entryFunction,
+                                     static_cast<spv::ExecutionMode>(exeMode),
+                                     execModesParams, expr->getExprLoc());
+}
+
+SpirvInstruction *
+SpirvEmitter::processIntrinsicExecutionModeId(const CallExpr *expr) {
+  // Lowers vk::ext_execution_mode_id(mode, ids...) to OpExecutionModeId.
+  assert(expr->getNumArgs() > 0);
+  // The mode itself must fold to a constant integer; diagnose otherwise.
+  Expr::EvalResult evalResult;
+  if (!expr->getArg(0)->EvaluateAsRValue(evalResult, astContext) ||
+      evalResult.HasSideEffects || !evalResult.Val.isInt()) {
+    emitError("The execution mode must be constant integer",
+              expr->getExprLoc());
+    return nullptr;
+  }
+  const uint32_t exeMode = evalResult.Val.getInt().getZExtValue();
+
+  // The remaining arguments become <id> operands. doExpr() can return
+  // nullptr (other callers null-check it); never record a null operand.
+  llvm::SmallVector<SpirvInstruction *, 2> execModesParams;
+  for (uint32_t i = 1; i < expr->getNumArgs(); ++i) {
+    SpirvInstruction *argInst = doExpr(expr->getArg(i));
+    if (!argInst)
+      return nullptr;
+    execModesParams.push_back(argInst);
+  }
+  assert(entryFunction != nullptr);
+  return spvBuilder.addExecutionModeId(entryFunction,
+                                       static_cast<spv::ExecutionMode>(exeMode),
+                                       execModesParams, expr->getExprLoc());
+}
 
 SpirvInstruction *
@@ -15218,8 +15247,9 @@ bool SpirvEmitter::spirvToolsValidate(std::vector<uint32_t> *mod,
 void SpirvEmitter::addDerivativeGroupExecutionMode() {
   assert(spvContext.isCS());
 
-  SpirvExecutionMode *numThreadsEm = spvBuilder.getModule()->findExecutionMode(
-      entryFunction, spv::ExecutionMode::LocalSize);
+  SpirvExecutionMode *numThreadsEm =
+      cast<SpirvExecutionMode>(spvBuilder.getModule()->findExecutionMode(
+          entryFunction, spv::ExecutionMode::LocalSize));
   auto numThreads = numThreadsEm->getParams();
 
   // The layout of the quad is determined by the numer of threads in each
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 79d2c43c35..6c1e12989c 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -781,8 +781,9 @@ class SpirvEmitter : public ASTConsumer {
   SpirvInstruction *processCooperativeMatrixGetLength(const CallExpr *call);
 
   /// Process vk::ext_execution_mode intrinsic
-  SpirvInstruction *processIntrinsicExecutionMode(const CallExpr *expr,
-                                                  bool useIdParams);
+  SpirvInstruction *processIntrinsicExecutionMode(const CallExpr *expr);
+  /// Process vk::ext_execution_mode_id intrinsic
+  SpirvInstruction *processIntrinsicExecutionModeId(const CallExpr *expr);
 
   /// Processes the 'firstbit{high|low}' intrinsic functions.
   SpirvInstruction *processIntrinsicFirstbit(const CallExpr *,
diff --git a/tools/clang/lib/SPIRV/SpirvInstruction.cpp b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
index f41de03adc..3b5861710d 100644
--- a/tools/clang/lib/SPIRV/SpirvInstruction.cpp
+++ b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
@@ -33,7 +33,9 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExtension)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExtInstImport)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvMemoryModel)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvEntryPoint)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionModeBase)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionMode)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionModeId)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvString)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvSource)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvModuleProcessed)
@@ -207,13 +209,16 @@ SpirvEntryPoint::SpirvEntryPoint(SourceLocation loc,
 // OpExecutionMode and OpExecutionModeId instructions
 SpirvExecutionMode::SpirvExecutionMode(SourceLocation loc, SpirvFunction *entry,
                                        spv::ExecutionMode em,
-                                       llvm::ArrayRef<uint32_t> paramsVec,
-                                       bool usesIdParams)
-    : SpirvInstruction(IK_ExecutionMode,
-                       usesIdParams ? spv::Op::OpExecutionModeId
-                                    : spv::Op::OpExecutionMode,
-                       QualType(), loc),
-      entryPoint(entry), execMode(em),
+                                       llvm::ArrayRef<uint32_t> paramsVec)
+    : SpirvExecutionModeBase(IK_ExecutionMode, spv::Op::OpExecutionMode, loc,
+                             entry, em),
+      params(paramsVec.begin(), paramsVec.end()) {}
+
+SpirvExecutionModeId::SpirvExecutionModeId(
+    SourceLocation loc, SpirvFunction *entry, spv::ExecutionMode em,
+    llvm::ArrayRef<SpirvInstruction *> paramsVec)
+    : SpirvExecutionModeBase(IK_ExecutionModeId, spv::Op::OpExecutionModeId,
+                             loc, entry, em),
       params(paramsVec.begin(), paramsVec.end()) {}
 
 SpirvString::SpirvString(SourceLocation loc, llvm::StringRef stringLiteral)
diff --git a/tools/clang/lib/SPIRV/SpirvModule.cpp b/tools/clang/lib/SPIRV/SpirvModule.cpp
index 9c6a826a5b..ed6aca7488 100644
--- a/tools/clang/lib/SPIRV/SpirvModule.cpp
+++ b/tools/clang/lib/SPIRV/SpirvModule.cpp
@@ -294,9 +294,10 @@ void SpirvModule::addEntryPoint(SpirvEntryPoint *ep) {
   entryPoints.push_back(ep);
 }
 
-SpirvExecutionMode *SpirvModule::findExecutionMode(SpirvFunction *entryPoint,
-                                                   spv::ExecutionMode em) {
-  for (SpirvExecutionMode *cem : executionModes) {
+SpirvExecutionModeBase *
+SpirvModule::findExecutionMode(SpirvFunction *entryPoint,
+                               spv::ExecutionMode em) {
+  for (SpirvExecutionModeBase *cem : executionModes) {
     if (cem->getEntryPoint() != entryPoint)
       continue;
     if (cem->getExecutionMode() != em)
@@ -306,7 +307,7 @@ SpirvExecutionMode *SpirvModule::findExecutionMode(SpirvFunction *entryPoint,
   return nullptr;
 }
 
-void SpirvModule::addExecutionMode(SpirvExecutionMode *em) {
+void SpirvModule::addExecutionMode(SpirvExecutionModeBase *em) {
   assert(em && "cannot add null execution mode");
   executionModes.push_back(em);
 }
diff --git a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl
index 0d63662ef8..beb0e23a95 100644
--- a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl
@@ -2,11 +2,11 @@
 
 // CHECK: OpCapability ShaderClockKHR
 // CHECK: OpExtension "SPV_KHR_shader_clock"
-// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeId %uint_8 %uint_8 %uint_8
-// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeHintId %uint_4 %uint_4 %uint_4
+// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeId %uint_8 %uint_6 %uint_8
+// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeHintId %int_4 %int_4 %int_4
 
 int main() : SV_Target0 {
-  vk::ext_execution_mode_id(/*LocalSizeId*/38, 8, 8, 8);
+  vk::ext_execution_mode_id(/*LocalSizeId*/38, 8u, 6u, 8u);
 
   [[vk::ext_capability(5055)]]
   [[vk::ext_extension("SPV_KHR_shader_clock")]]

From 75220978aab87a3d483ce6aadaeb87d7024e20e5 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 29 Apr 2025 10:58:12 -0700
Subject: [PATCH 15/93] [CoopVec] Add Linear Algebra common header with tests
 (#7350) (#7388)

This PR introduces the linear algebra header file, and places it in a
location that is by default included in all HLSL compilation. The
builtins in the API aren't yet defined, and depend on the #7290 PR
merging first.
The tests that have been added have temporary diagnostic messages while
7290 is in progress. They will need to be updated. Open to feedback on
better / suggested error messages, or whether there shouldn't be any
sema-level validation for these errors.

Fixes
[#7304](https://github.com/microsoft/DirectXShaderCompiler/issues/7304)

Cherrypick of
https://github.com/microsoft/DirectXShaderCompiler/pull/7350

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 tools/clang/lib/Headers/hlsl/dx/linalg.h      | 182 ++++++++++++++++++
 .../CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl  |  40 ++++
 .../hlsl/linalg/mat-vec-muladd.hlsl           |  90 +++++++++
 .../hlsl/linalg/outerproductaccumulate.hlsl   |  16 ++
 .../hlsl/linalg/vectoraccumulate.hlsl         |  14 ++
 .../hlsl/linalg/make-interp-vec-errors.hlsl   |  33 ++++
 .../hlsl/linalg/mat-vec-mul-errors.hlsl       |  16 ++
 .../linalg/mat-vec-mul-transpose-errors.hlsl  |  30 +++
 .../hlsl/linalg/mat-vec-muladd-errors.hlsl    |  16 ++
 .../linalg/outerproductaccumulate-errors.hlsl |  44 +++++
 .../outerproductaccumulate-spirv-errors.hlsl  |  19 ++
 .../hlsl/linalg/vectoraccumulate-errors.hlsl  |  16 ++
 12 files changed, 516 insertions(+)
 create mode 100644 tools/clang/lib/Headers/hlsl/dx/linalg.h
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl

diff --git a/tools/clang/lib/Headers/hlsl/dx/linalg.h b/tools/clang/lib/Headers/hlsl/dx/linalg.h
new file mode 100644
index 0000000000..51e662bbc9
--- /dev/null
+++ b/tools/clang/lib/Headers/hlsl/dx/linalg.h
@@ -0,0 +1,182 @@
+// Header for linear algebra APIs.
+
+#if __spirv__
+#error "Cooperative vectors not (yet) supported for SPIRV"
+#endif
+
+#if ((__SHADER_TARGET_MAJOR > 6) ||                                            \
+     (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 9)) &&            \
+    (__HLSL_VERSION >= 2021)
+
+namespace dx {
+namespace linalg {
+
+// NOTE: this is a plain enum rather than an enum class because using an
+// enum class as a template argument produces this error:
+//     error: non-type template argument of type 'dx::linalg::DataType' is not
+//     an integral constant expression
+enum DataType {
+  DATA_TYPE_SINT16 = 2,           // ComponentType::I16
+  DATA_TYPE_UINT16 = 3,           // ComponentType::U16
+  DATA_TYPE_SINT32 = 4,           // ComponentType::I32
+  DATA_TYPE_UINT32 = 5,           // ComponentType::U32
+  DATA_TYPE_FLOAT16 = 8,          // ComponentType::F16
+  DATA_TYPE_FLOAT32 = 9,          // ComponentType::F32
+  DATA_TYPE_SINT8_T4_PACKED = 17, // ComponentType::PackedS8x32
+  DATA_TYPE_UINT8_T4_PACKED = 18, // ComponentType::PackedU8x32
+  DATA_TYPE_UINT8 = 19,           // ComponentType::U8
+  DATA_TYPE_SINT8 = 20,           // ComponentType::I8
+  DATA_TYPE_FLOAT8_E4M3 = 21,     // ComponentType::F8_E4M3
+                                  // (1 sign, 4 exp, 3 mantissa bits)
+  DATA_TYPE_FLOAT8_E5M2 = 22,     // ComponentType::F8_E5M2
+                                  // (1 sign, 5 exp, 2 mantissa bits)
+};
+
+enum MatrixLayout {
+  MATRIX_LAYOUT_ROW_MAJOR = 0,
+  MATRIX_LAYOUT_COLUMN_MAJOR = 1,
+  MATRIX_LAYOUT_MUL_OPTIMAL = 2,
+  MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL = 3
+};
+
+// Signedness helper: IsUnsigned<T>() returns true only for the unsigned
+// integer types specialized below (uint16_t only when __HLSL_ENABLE_16_BIT
+// is defined) and false for every other T.
+namespace details {
+template <typename T> bool IsUnsigned() { return false; }
+
+#ifdef __HLSL_ENABLE_16_BIT
+template <> bool IsUnsigned<uint16_t>() { return true; }
+#endif
+
+template <> bool IsUnsigned<uint32_t>() { return true; }
+template <> bool IsUnsigned<uint64_t>() { return true; }
+} // namespace details
+
+// (RW)MatrixRef: a matrix held in a (RW)ByteAddressBuffer. The DataType,
+// dimensions, layout and transpose flag are template arguments; the buffer,
+// start offset and stride are data members.
+
+template <typename BufferTy, DataType DT, uint M, uint K, MatrixLayout ML,
+          bool Transpose>
+struct MatrixRefImpl {
+  BufferTy Buffer;
+  uint StartOffset;
+  uint Stride;
+};
+
+template <DataType DT, uint M, uint K, MatrixLayout ML, bool Transpose = false>
+using MatrixRef = MatrixRefImpl<ByteAddressBuffer, DT, M, K, ML, Transpose>;
+
+template <DataType DT, uint M, uint K, MatrixLayout ML, bool Transpose = false>
+using RWMatrixRef = MatrixRefImpl<RWByteAddressBuffer, DT, M, K, ML, Transpose>;
+
+// (RW)VectorRef: a vector held in a (RW)ByteAddressBuffer. The DataType is a
+// template argument; the buffer and start offset are data members.
+//
+
+template <typename BufferTy, DataType DT> struct VectorRefImpl {
+  BufferTy Buffer;
+  uint StartOffset;
+};
+
+template <DataType DT> using VectorRef = VectorRefImpl<ByteAddressBuffer, DT>;
+
+template <DataType DT>
+using RWVectorRef = VectorRefImpl<RWByteAddressBuffer, DT>;
+
+// InterpretedVector: a vector<T, N> tagged with a DataType.
+// MakeInterpretedVector<DT>(Vec) wraps Vec; only DT has to be spelled out,
+// T and N are deduced from the argument.
+
+template <typename T, int N, DataType DT> struct InterpretedVector {
+  vector<T, N> Data;
+};
+
+template <DataType DT, typename T, int N>
+InterpretedVector<T, N, DT> MakeInterpretedVector(vector<T, N> Vec) {
+  InterpretedVector<T, N, DT> IV = {Vec};
+  return IV;
+}
+
+// Mul: calls __builtin_MatVecMul with the matrix/vector template arguments
+// and the matrix buffer, offset and stride, then returns the MatrixM-element
+// output vector. OutputElTy cannot be deduced and is given by the caller.
+
+template <typename OutputElTy, typename InputElTy, int InputElCount,
+          typename MatrixBufferTy, DataType InputDT, DataType MatrixDT,
+          uint MatrixM, uint MatrixK, MatrixLayout MatrixLayout,
+          bool MatrixTranspose>
+vector<OutputElTy, MatrixM>
+Mul(MatrixRefImpl<MatrixBufferTy, MatrixDT, MatrixM, MatrixK, MatrixLayout,
+                  MatrixTranspose>
+        Matrix,
+    InterpretedVector<InputElTy, InputElCount, InputDT> InputVector) {
+
+  vector<OutputElTy, MatrixM> OutputVector;
+
+  __builtin_MatVecMul(
+      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>(), InputVector.Data,
+      details::IsUnsigned<InputElTy>(), InputDT, Matrix.Buffer,
+      Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, MatrixLayout,
+      MatrixTranspose, Matrix.Stride);
+
+  return OutputVector;
+}
+
+// MulAdd: calls __builtin_MatVecMulAdd with the matrix/vector arguments plus
+// the bias vector's buffer, start offset and DataType, then returns the
+// MatrixM-element output vector. OutputElTy is given by the caller.
+
+template <typename OutputElTy, typename InputElTy, int InputElCount,
+          typename MatrixBufferTy, DataType InputDT, DataType MatrixDT,
+          uint MatrixM, uint MatrixK, MatrixLayout MatrixLayout,
+          bool MatrixTranspose, typename BiasVectorBufferTy,
+          DataType BiasVectorDT>
+vector<OutputElTy, MatrixM>
+MulAdd(MatrixRefImpl<MatrixBufferTy, MatrixDT, MatrixM, MatrixK, MatrixLayout,
+                     MatrixTranspose>
+           Matrix,
+       InterpretedVector<InputElTy, InputElCount, InputDT> InputVector,
+       VectorRefImpl<BiasVectorBufferTy, BiasVectorDT> BiasVector) {
+
+  vector<OutputElTy, MatrixM> OutputVector;
+
+  __builtin_MatVecMulAdd(
+      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>(), InputVector.Data,
+      details::IsUnsigned<InputElTy>(), InputDT, Matrix.Buffer,
+      Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, MatrixLayout,
+      MatrixTranspose, Matrix.Stride, BiasVector.Buffer, BiasVector.StartOffset,
+      BiasVectorDT);
+
+  return OutputVector;
+}
+
+// OuterProductAccumulate: forwards both input vectors and the destination
+// RWMatrixRef (buffer, offset, DataType, layout, stride) to
+// __builtin_OuterProductAccumulate. Only non-transposed matrices match.
+
+template <typename ElTy, int MatrixM, int MatrixN, DataType MatrixDT,
+          MatrixLayout MatrixLayout>
+void OuterProductAccumulate(
+    vector<ElTy, MatrixM> InputVector1, vector<ElTy, MatrixN> InputVector2,
+    RWMatrixRef<MatrixDT, MatrixM, MatrixN, MatrixLayout, false> Matrix) {
+  __builtin_OuterProductAccumulate(InputVector1, InputVector2, Matrix.Buffer,
+                                   Matrix.StartOffset, MatrixDT, MatrixLayout,
+                                   Matrix.Stride);
+}
+
+// VectorAccumulate: forwards InputVector, the destination buffer and Offset
+// to __builtin_VectorAccumulate.
+//
+
+template <typename ElTy, int ElCount>
+void VectorAccumulate(vector<ElTy, ElCount> InputVector,
+                      RWByteAddressBuffer Buffer, uint Offset) {
+  __builtin_VectorAccumulate(InputVector, Buffer, Offset);
+}
+
+} // namespace linalg
+} // namespace dx
+
+#endif // SM 6.9 check and HV version check
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
new file mode 100644
index 0000000000..141801c71c
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
@@ -0,0 +1,40 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s | FileCheck %s
+
+#include <dx/linalg.h>
+
+ByteAddressBuffer Buf;
+
+export float4 Test1(vector<float, 4> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL, true> Matrix = {
+      Buf, 0, 0};
+
+  // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMul.v4f32.v4f32(i32 305, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle %{{.+}}, i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, i1 false)
+  return Mul<float>(    
+      Matrix, MakeInterpretedVector<DATA_TYPE_FLOAT16>(Input));
+}
+
+export vector<float, 8> Test2(vector<uint8_t4_packed, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_MUL_OPTIMAL> Matrix = {
+      Buf, 0, 0};
+
+  // note the stride argument is dropped.
+  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6f32(i32 305, <6 x float> %{{.+}}, i1 false, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 2, i1 false, i32 0, i1 false)
+  return Mul<float>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
+}
+
+// test that "stride" isn't ignored in non-optimal layouts
+export vector<float, 8> Test3(vector<uint8_t4_packed, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
+      Buf, 0, 6 * 4 * 8};
+
+  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6f32(i32 305, <6 x float> %{{.+}}, i1 false, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 false)
+  return Mul<float>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl
new file mode 100644
index 0000000000..c19e601904
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl
@@ -0,0 +1,90 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s
+
+#include <dx/linalg.h>
+
+ByteAddressBuffer Buf;
+
+export float4 Test1(float4 input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL> matrix = {Buf,
+                                                                          0, 0};
+  VectorRef<DATA_TYPE_FLOAT16> biasVector = {Buf, 256};
+
+  InterpretedVector<float, 4, DATA_TYPE_FLOAT16> theVector = {input};
+
+  // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 false, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false)
+  return MulAdd<float>(
+      matrix, theVector,
+      biasVector);
+}
+
+export float4 Test2(float4 input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL, true> matrix = {
+      Buf, 0, 0};
+  VectorRef<DATA_TYPE_FLOAT16> biasVector = {Buf, 256};
+
+  InterpretedVector<float, 4, DATA_TYPE_FLOAT16> theVector = {input};
+
+  // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false)
+  return MulAdd<float>(
+      matrix, theVector,
+      biasVector);
+}
+
+export float4 Test3(float4 input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL, true> matrix = {
+      Buf, 0, 0};
+  VectorRef<DATA_TYPE_FLOAT16> biasVector = {Buf, 256};
+
+  // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false)
+  return MulAdd<float>(
+      matrix, MakeInterpretedVector<DATA_TYPE_FLOAT16>(input),
+      biasVector);
+}
+
+namespace ProposalExample {
+
+ByteAddressBuffer model;
+
+vector<float, 3> ApplyNeuralMaterial(vector<half, 8> inputVector) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT8_E4M3, 32, 8, MATRIX_LAYOUT_MUL_OPTIMAL> matrix0 = {
+      model, 0, 0};
+
+  VectorRef<DATA_TYPE_FLOAT16> biasVector0 = {model, 1024};
+
+  MatrixRef<DATA_TYPE_FLOAT8_E4M3, 32, 32, MATRIX_LAYOUT_MUL_OPTIMAL> matrix1 =
+      {model, 2048, 0};
+
+  VectorRef<DATA_TYPE_FLOAT16> biasVector1 = {model, 3072};
+
+  MatrixRef<DATA_TYPE_FLOAT8_E4M3, 3, 32, MATRIX_LAYOUT_MUL_OPTIMAL> matrix2 = {
+      model, 4096, 0};
+
+  VectorRef<DATA_TYPE_FLOAT16> biasVector2 = {model, 5120};
+
+  vector<half, 32> layer0 = MulAdd<half>(
+      matrix0, MakeInterpretedVector<DATA_TYPE_FLOAT8_E4M3>(inputVector),
+      biasVector0);
+  layer0 = max(layer0, 0);
+
+  vector<half, 32> layer1 = MulAdd<half>(
+      matrix1, MakeInterpretedVector<DATA_TYPE_FLOAT8_E4M3>(layer0),
+      biasVector1);
+  layer1 = max(layer1, 0);
+
+  vector<float, 3> output = MulAdd<float>(
+      matrix2, MakeInterpretedVector<DATA_TYPE_FLOAT8_E4M3>(layer1),
+      biasVector2);
+  output = exp(output);
+
+  return output;
+}
+
+} // namespace ProposalExample
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl
new file mode 100644
index 0000000000..eda15c66f6
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s | FileCheck %s
+
+#include <dx/linalg.h>
+
+RWByteAddressBuffer RWBuf;
+
+export void Test4(vector<half, 128> Input1, vector<half, 64> Input2) {
+  using namespace dx::linalg;
+
+  RWMatrixRef<DATA_TYPE_FLOAT16, 128, 64, MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL>
+      matrix = {RWBuf, 0, 0};
+
+  // CHECK: call void @dx.op.outerProductAccumulate.v128f16.v64f16(i32 307, <128 x half> %{{.+}}, <64 x half> %{{.+}}, %dx.types.Handle %{{.+}}, i32 0, i32 8, i32 3, i32 0)
+
+  OuterProductAccumulate(Input1, Input2, matrix);  
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl
new file mode 100644
index 0000000000..9157156f10
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s
+
+#include <dx/linalg.h>
+
+RWByteAddressBuffer RWBuf;
+
+export void Test5(vector<half, 128> Input) {
+  using namespace dx::linalg;
+
+  RWBuf.Store<vector<half, 128> >(0, Input);
+
+  // CHECK: call void @dx.op.vectorAccumulate.v128f32(i32 308, <128 x float> %{{.*}}, %dx.types.Handle %{{.*}}, i32 0)
+  VectorAccumulate(Input, RWBuf, 0);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
new file mode 100644
index 0000000000..9f2793d417
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
@@ -0,0 +1,33 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify
+
+#include <dx/linalg.h>
+ByteAddressBuffer Buf;
+
+export float4 Test1(vector<float, 4> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL, true> Matrix = {
+      Buf, 0, 0};
+
+  // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}}
+  // expected-note@dx/linalg.h:97{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
+  return Mul<float>(    
+      Matrix, MakeInterpretedVector<2>(Input));
+}
+
+enum DataType {
+  DATA_TYPE_InvalidType = 40
+};
+
+export float4 Test2(vector<float, 4> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT16, 4, 4, MATRIX_LAYOUT_MUL_OPTIMAL, true> Matrix = {
+      Buf, 0, 0};
+
+  // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}}
+  // expected-note@dx/linalg.h:97{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
+  return Mul<float>(    
+      Matrix, MakeInterpretedVector<DATA_TYPE_InvalidType>(Input));
+}
+
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
new file mode 100644
index 0000000000..2d5a11e83e
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify
+
+#include <dx/linalg.h>
+
+ByteAddressBuffer Buf;
+
+vector<float, 128> MixUpVectorAndMatrixArguments(vector<float, 128> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 128, 128, MATRIX_LAYOUT_MUL_OPTIMAL> Matrix = {
+      Buf, 0, 0};
+
+  // expected-error@+2{{no matching function for call to 'Mul'}}
+  // expected-note@dx/linalg.h:111{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
+  return Mul<float>(MakeInterpretedVector<DATA_TYPE_FLOAT16>(Input), Matrix);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl
new file mode 100644
index 0000000000..2018acafab
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl
@@ -0,0 +1,30 @@
+// XFAIL: *
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+ByteAddressBuffer Buf;
+
+export float4 Test1(vector<float, 4> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_FLOAT16, 4, 4, MATRIX_LAYOUT_ROW_MAJOR, true> Matrix = {
+      Buf, 0, 0};
+
+  // PREVIEW CHECK TODO:
+  // expected-error@+1{{something about transposing not supported for rowmajor / colmajor layouts}}
+  return Mul<float>(    
+      Matrix, MakeInterpretedVector<DATA_TYPE_FLOAT16>(Input));
+}
+
+export vector<float, 8> Test2(vector<uint8_t4_packed, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_COLUMN_MAJOR> Matrix = {
+      Buf, 0, 0};
+
+  // PREVIEW CHECK TODO:
+  // expected-error@+1{{something about transposing not supported for rowmajor / colmajor layouts}}
+  return Mul<float>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
new file mode 100644
index 0000000000..f444f81c3a
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify
+
+#include <dx/linalg.h>
+
+ByteAddressBuffer Buf;
+
+vector<float, 128> MixUpVectorAndMatrixArguments(vector<float, 128> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_SINT16, 128, 128, MATRIX_LAYOUT_MUL_OPTIMAL> Matrix = {
+      Buf, 0, 0};
+
+  // expected-error@+2{{no matching function for call to 'MulAdd'}}
+  // expected-note@dx/linalg.h:137{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
+  return MulAdd<float>(MakeInterpretedVector<DATA_TYPE_SINT16>(Input), Matrix, MakeInterpretedVector<DATA_TYPE_SINT16>(Input));
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
new file mode 100644
index 0000000000..6f503b367b
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
@@ -0,0 +1,44 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+RWByteAddressBuffer RWBuf;
+
+// test for inputs of different size
+export void Test4(vector<half, 128> Input1, vector<half, 64> Input2) {
+  using namespace dx::linalg;
+
+  RWMatrixRef<DATA_TYPE_FLOAT16, 128, 64, MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true>
+      matrix = {RWBuf, 0, 0};
+
+  // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
+  // expected-note@dx/linalg.h:161{{candidate template ignored: could not match 0 against 1}}
+
+  OuterProductAccumulate(Input1, Input2, matrix);  
+}
+
+// now test for an error when element types differ
+export void Test5(vector<int, 128> Input1, vector<uint, 128> Input2) {
+  using namespace dx::linalg;
+
+  RWMatrixRef<DATA_TYPE_FLOAT16, 128, 128, MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true>
+      matrix = {RWBuf, 0, 0};
+
+  // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
+  // expected-note@dx/linalg.h:161{{candidate template ignored: deduced conflicting types for parameter 'ElTy' ('int' vs. 'unsigned int')}}
+
+  OuterProductAccumulate(Input1, Input2, matrix);  
+}
+
+// now test for an error when matrix transpose parameter is true
+export void Test4(vector<half, 64> Input1, vector<half, 64> Input2) {
+  using namespace dx::linalg;
+
+  RWMatrixRef<DATA_TYPE_FLOAT16, 64, 64, MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true>
+      matrix = {RWBuf, 0, 0};
+
+  // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
+  // expected-note@dx/linalg.h:161{{candidate template ignored: could not match 0 against 1}}
+
+  OuterProductAccumulate(Input1, Input2, matrix);  
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl
new file mode 100644
index 0000000000..0213103926
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types -spirv %s -verify
+
+// Tests that the header file cannot be included for spirv compilations
+// This is a copy of \tools\clang\test\CodeGenDXIL\hlsl\linalg\outerproductaccumulate.hlsl
+// except that spirv is targeted
+
+// expected-error@dx/linalg.h:4{{Cooperative vectors not (yet) supported for SPIRV}}
+#include <dx/linalg.h>
+
+RWByteAddressBuffer RWBuf;
+
+export void Test4(vector<half, 128> Input1, vector<half, 64> Input2) {
+  using namespace dx::linalg;
+
+  RWMatrixRef<DATA_TYPE_FLOAT16, 128, 64, MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL>
+      matrix = {RWBuf, 0, 0};
+
+  OuterProductAccumulate(Input1, Input2, matrix);  
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl
new file mode 100644
index 0000000000..4c8ae6f049
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl
@@ -0,0 +1,16 @@
+// XFAIL: *
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s
+
+#include <dx/linalg.h>
+
+RWByteAddressBuffer RWBuf;
+
+export void Test5(vector<float, 128> Input) {
+  using namespace dx::linalg;
+
+  RWBuf.Store<vector<half, 128> >(0, Input);
+
+  // PREVIEW CHECK TODO:
+  // CHECK: Something about an error due to illegal conversions
+  VectorAccumulate(Input, RWBuf, 0);
+}

From 3bf630525da490d3c62d06e26922d00661822438 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Tue, 29 Apr 2025 23:44:59 +0200
Subject: [PATCH 16/93] [SER] Validate %dx.types.HitObject as legal type same
 as handle (#7390)

Validator did not recognize %dx.types.HitObject as an allowed type. This
led to validation failures in -Od compiles where allocas, loads and
stores remain in the generated DXIL:

```
  dxc.exe -T lib_6_9 -Od \tools\clang\test\CodeGenDXIL\hlsl\intrinsics\maybereorder.hlsl
  error: validation errors
  error: Declaration '%dx.types.HitObject = type { i8* }' uses a reserved prefix.
```
Closes #7387
---
 lib/DxilValidation/DxilValidation.cpp         |  3 +++
 .../hlsl/intrinsics/maybereorder_od.hlsl      | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl

diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index c4448d1ec4..694ab43a7a 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -2599,6 +2599,9 @@ static bool ValidateType(Type *Ty, ValidationContext &ValCtx,
       if (ValCtx.HandleTy == Ty)
         return true;
       hlsl::OP *HlslOP = ValCtx.DxilMod.GetOP();
+      // Allow HitObject type.
+      if (ST == HlslOP->GetHitObjectType())
+        return true;
       if (IsDxilBuiltinStructType(ST, HlslOP)) {
         ValCtx.EmitTypeError(Ty, ValidationRule::InstrDxilStructUser);
         Result = false;
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl
new file mode 100644
index 0000000000..42dff9c52c
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -T lib_6_9 -E main %s -Od | FileCheck %s --check-prefix DXIL
+
+// DXIL: %[[HOA:[^ ]+]] = alloca %dx.types.HitObject, align 4
+// DXIL-NEXT: %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+// DXIL-NEXT: store %dx.types.HitObject %[[NOP]], %dx.types.HitObject* %[[HOA]]
+// DXIL-NEXT: %[[LD0:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOA]]
+// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[LD0]], i32 undef, i32 0)  ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB)
+// DXIL-NEXT: %[[LD1:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOA]]
+// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[LD1]], i32 241, i32 3)  ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB)
+// DXIL-NEXT: %[[NOP2:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[NOP2]], i32 242, i32 7)  ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB)
+
+[shader("raygeneration")]
+void main() {
+  dx::HitObject hit;
+  dx::MaybeReorderThread(hit);
+  dx::MaybeReorderThread(hit, 0xf1, 3);
+  dx::MaybeReorderThread(0xf2, 7);
+}

From 0fd79eba6bb23f50ec21a7a7daeee3614bebe12b Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 29 Apr 2025 15:25:25 -0700
Subject: [PATCH 17/93] Change default validation behavior (#7392)

This PR changes the default validation behavior to use the internal
validator.
If no options are specified, the internal validator will be used, and if
it fails, then compilation fails.
The external validator can still be run but must be explicitly chosen.
Specifying internal works just as before.

There is plenty of testing and infrastructure that needs to be added to
verify this change, but that needs to be added in a separate change.
This change is step 1.

Addresses https://github.com/microsoft/DirectXShaderCompiler/issues/7389
---
 include/dxc/Support/HLSLOptions.h        |  2 +-
 tools/clang/tools/dxcompiler/dxcutil.cpp | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/include/dxc/Support/HLSLOptions.h b/include/dxc/Support/HLSLOptions.h
index 56e95a1659..bad330747b 100644
--- a/include/dxc/Support/HLSLOptions.h
+++ b/include/dxc/Support/HLSLOptions.h
@@ -115,7 +115,7 @@ struct RewriterOpts {
 };
 
 enum class ValidatorSelection : int {
-  Auto,        // Try DXIL.dll; fallback to internal validator
+  Auto,        // Force internal validator (even if DXIL.dll is present)
   Internal,    // Force internal validator (even if DXIL.dll is present)
   External,    // Use DXIL.dll, failing compilation if not available
   Invalid = -1 // Invalid
diff --git a/tools/clang/tools/dxcompiler/dxcutil.cpp b/tools/clang/tools/dxcompiler/dxcutil.cpp
index d3a531d4c6..ea3f72dcb4 100644
--- a/tools/clang/tools/dxcompiler/dxcutil.cpp
+++ b/tools/clang/tools/dxcompiler/dxcutil.cpp
@@ -49,6 +49,7 @@ HRESULT RunInternalValidator(IDxcValidator *pValidator,
 namespace {
 // AssembleToContainer helper functions.
 
+// return true if the internal validator was used, false otherwise
 bool CreateValidator(CComPtr<IDxcValidator> &pValidator,
                      hlsl::options::ValidatorSelection SelectValidator =
                          hlsl::options::ValidatorSelection::Auto) {
@@ -56,16 +57,25 @@ bool CreateValidator(CComPtr<IDxcValidator> &pValidator,
       SelectValidator == hlsl::options::ValidatorSelection::Internal;
   bool bExternal =
       SelectValidator == hlsl::options::ValidatorSelection::External;
-  if (!bInternal && DxilLibIsEnabled())
-    DxilLibCreateInstance(CLSID_DxcValidator, &pValidator);
+  bool bAuto = SelectValidator == hlsl::options::ValidatorSelection::Auto;
 
-  bool bInternalValidator = false;
-  if (pValidator == nullptr) {
-    IFTBOOL(!bExternal, DXC_E_VALIDATOR_MISSING);
+  // default behavior uses internal validator, as well as
+  // explicitly specifying internal
+  if (bInternal || bAuto) {
     IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator)));
-    bInternalValidator = true;
+    return true;
+  }
+
+  if (bExternal) {
+    // if external was explicitly specified, but no
+    // external validator could be found (no DXIL.dll), then error
+    IFTBOOL(DxilLibIsEnabled(), DXC_E_VALIDATOR_MISSING);
+    IFT(DxilLibCreateInstance(CLSID_DxcValidator, &pValidator));
+
+    return false;
   }
-  return bInternalValidator;
+
+  return false;
 }
 
 } // namespace

From 4d7c704e42921fa4a4b545963b3b353a6cdb7363 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 29 Apr 2025 17:04:49 -0700
Subject: [PATCH 18/93] Update version to 1.8.2505 (#7398)

Update version to 1.8.2505
---
 docs/ReleaseNotes.md              | 5 ++---
 utils/version/latest-release.json | 4 ++--
 utils/version/version.inc         | 6 +++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 637bd8dae8..5d7dfcb9f4 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -17,11 +17,10 @@ The included licenses apply to the following files:
 
 ## Changelog
 
-### Upcoming Release
-
-Place release notes for the upcoming release below this line and remove this line upon naming this release.
+### Version 1.8.2505
 
 - Typed buffers (including ROV buffers) no longer accept types other than vectors and scalars. Any other types will produce descriptive errors. This removes support for appropriately sized matrices and structs. Though it worked in some contexts, code generated from such types was unreliable.
+- By default, the internal validator will be used instead of searching externally for an existing DXIL.dll.
 
 ### Version 1.8.2502
 
diff --git a/utils/version/latest-release.json b/utils/version/latest-release.json
index 3138ccd2b1..146acf0708 100644
--- a/utils/version/latest-release.json
+++ b/utils/version/latest-release.json
@@ -2,7 +2,7 @@
     "version": {
         "major": "1",
         "minor": "8",
-        "rev": "2502"
+        "rev": "2505"
     },
-    "sha": "070d0d5a2beacef9eeb51037a9b04665716fd6f3"
+    "sha": "0fd79eba6bb23f50ec21a7a7daeee3614bebe12b"
 }
diff --git a/utils/version/version.inc b/utils/version/version.inc
index 2577daa529..1d33b63ee2 100644
--- a/utils/version/version.inc
+++ b/utils/version/version.inc
@@ -18,7 +18,7 @@
 #ifdef RC_VERSION_FIELD_3
 #undef RC_VERSION_FIELD_3
 #endif
-#define RC_VERSION_FIELD_3 2502
+#define RC_VERSION_FIELD_3 2505
 
 #ifdef RC_VERSION_FIELD_4
 #undef RC_VERSION_FIELD_4
@@ -28,7 +28,7 @@
 #ifdef RC_FILE_VERSION
 #undef RC_FILE_VERSION
 #endif
-#define RC_FILE_VERSION "1.8.2502.0"
+#define RC_FILE_VERSION "1.8.2505.0"
 
 #ifdef RC_FILE_DESCRIPTION
 #undef RC_FILE_DESCRIPTION
@@ -49,7 +49,7 @@
 #ifdef RC_PRODUCT_VERSION
 #undef RC_PRODUCT_VERSION
 #endif
-#define RC_PRODUCT_VERSION "1.8.2502.0"
+#define RC_PRODUCT_VERSION "1.8.2505.0"
 
 #ifdef HLSL_TOOL_NAME
 #undef HLSL_TOOL_NAME

From b98c00f56818348ceb8da053794e44cb3245bafb Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 29 Apr 2025 19:44:51 -0700
Subject: [PATCH 19/93] Add upcoming release (#7399)

Add upcoming release section to the release notes.
---
 docs/ReleaseNotes.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 5d7dfcb9f4..7788c57726 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -17,6 +17,10 @@ The included licenses apply to the following files:
 
 ## Changelog
 
+### Upcoming Release
+
+Place release notes for the upcoming release below this line and remove this line upon naming this release.
+
 ### Version 1.8.2505
 
 - Typed buffers (including ROV buffers) no longer accept types other than vectors and scalars. Any other types will produce descriptive errors. This removes support for appropriately sized matrices and structs. Though it worked in some contexts, code generated from such types was unreliable.

From 6d67e4a0af4710d18515ccaf0153922c172d415d Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Wed, 30 Apr 2025 13:04:21 -0700
Subject: [PATCH 20/93] Move spirv file to appropriate dir with lit config
 settings (#7406)

This test enables spirv codegen, so it must live in a directory whose lit
cfg file marks spirv as required; that way a build without spirv codegen
reports the test as unsupported.
---
 .../linalg/outerproductaccumulate-spirv-errors.hlsl               | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tools/clang/test/{SemaHLSL/hlsl => CodeGenSPIRV}/linalg/outerproductaccumulate-spirv-errors.hlsl (100%)

diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl b/tools/clang/test/CodeGenSPIRV/linalg/outerproductaccumulate-spirv-errors.hlsl
similarity index 100%
rename from tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-spirv-errors.hlsl
rename to tools/clang/test/CodeGenSPIRV/linalg/outerproductaccumulate-spirv-errors.hlsl

From 1a75a30f815309ae5bb4cf0211450751e8f753c1 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Thu, 1 May 2025 07:54:04 -0700
Subject: [PATCH 21/93] Don't pass through to DXIL.dll in DxcCreateInstance for
 CLSID_DxcValidator (#7409)

This change removes the code meant to pass through to DXIL.dll when
creating a CLSID_DxcValidator object with DxcCreateInstance.

Since the internal validator is now the default, and it will sign
shaders, there is no reason to pass through to DXIL.dll when using
DxcCreateInstance on DxCompiler.dll.

Testing will come with planned work to include the ability to globally
override the default validator to external.
---
 tools/clang/tools/dxcompiler/dxcapi.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/clang/tools/dxcompiler/dxcapi.cpp b/tools/clang/tools/dxcompiler/dxcapi.cpp
index a6a877cba4..ab2cf1f40e 100644
--- a/tools/clang/tools/dxcompiler/dxcapi.cpp
+++ b/tools/clang/tools/dxcompiler/dxcapi.cpp
@@ -87,11 +87,7 @@ static HRESULT ThreadMallocDxcCreateInstance(REFCLSID rclsid, REFIID riid,
   } else if (IsEqualCLSID(rclsid, CLSID_DxcUtils)) {
     hr = CreateDxcUtils(riid, ppv);
   } else if (IsEqualCLSID(rclsid, CLSID_DxcValidator)) {
-    if (DxilLibIsEnabled()) {
-      hr = DxilLibCreateInstance(rclsid, riid, (IUnknown **)ppv);
-    } else {
-      hr = CreateDxcValidator(riid, ppv);
-    }
+    hr = CreateDxcValidator(riid, ppv);
   } else if (IsEqualCLSID(rclsid, CLSID_DxcAssembler)) {
     hr = CreateDxcAssembler(riid, ppv);
   } else if (IsEqualCLSID(rclsid, CLSID_DxcOptimizer)) {

From 11e28952e8b7ee7d20d7bcdb1577c781c08fbe51 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Thu, 1 May 2025 10:39:22 -0700
Subject: [PATCH 22/93] Add DXIL REQUIRES to certain tests that require a
 minimum validation version (#7408)

Internal testing using older validators reveals some failures on tests
that were intended to be run on newer validators. This PR changes the
tests to require a minimum validation version to run.
---
 tools/clang/test/LitDXILValidation/load-store-validation.ll | 1 +
 tools/clang/test/LitDXILValidation/vector-validation.ll     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tools/clang/test/LitDXILValidation/load-store-validation.ll b/tools/clang/test/LitDXILValidation/load-store-validation.ll
index 34b2f6b602..16c64672bd 100644
--- a/tools/clang/test/LitDXILValidation/load-store-validation.ll
+++ b/tools/clang/test/LitDXILValidation/load-store-validation.ll
@@ -1,3 +1,4 @@
+; REQUIRES: dxil-1-9
 ; RUN: not %dxv %s 2>&1 | FileCheck %s
 
 ; Ensure proper validation errors are produced for invalid parameters to load and store operations.
diff --git a/tools/clang/test/LitDXILValidation/vector-validation.ll b/tools/clang/test/LitDXILValidation/vector-validation.ll
index 74e8116e88..b32ac0cd5c 100644
--- a/tools/clang/test/LitDXILValidation/vector-validation.ll
+++ b/tools/clang/test/LitDXILValidation/vector-validation.ll
@@ -1,3 +1,4 @@
+; REQUIRES: dxil-1-9
 ; RUN: not %dxv %s 2>&1 | FileCheck %s
 
 ; Confirm that 6.9 specific LLVM operations and DXIL intrinsics fail in 6.8

From a4f8cf9483ba4b07b498eb8add736dfbb1821404 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 2 May 2025 18:23:05 +0200
Subject: [PATCH 23/93] [SER] Validate HitObject_FromRayQuery[WithAttrs]
 (#7402)

Validate:
 HitObject_FromRayQuery
 HitObject_FromRayQueryWithAttrs

Rules:
 No `undef` inputs

SER implementation tracker: #7214
---
 lib/DxilValidation/DxilValidation.cpp         | 11 +++
 .../ser_hitobject_fromrayquery_failing.ll     | 99 +++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll

diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 694ab43a7a..11dfb42a6c 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -2287,6 +2287,17 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
     break;
   }
 
+  // Shader Execution Reordering - from ray query: no undef args after opcode
+  case DXIL::OpCode::HitObject_FromRayQuery:
+  case DXIL::OpCode::HitObject_FromRayQueryWithAttrs: {
+    for (unsigned i = 1; i < CI->getNumArgOperands(); ++i) {
+      Value *Arg = CI->getArgOperand(i);
+      if (isa<UndefValue>(Arg))
+        ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+    }
+    break;
+  }
+
   case DXIL::OpCode::AtomicBinOp:
   case DXIL::OpCode::AtomicCompareExchange: {
     Type *pOverloadType = OP::GetOverloadType(Opcode, CI->getCalledFunction());
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll
new file mode 100644
index 0000000000..602ff99a55
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll
@@ -0,0 +1,99 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%attrsud3 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%attrsud2 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 undef, %struct.CustomAttrs* nonnull %attra)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%attrsud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 undef, i32 16, %struct.CustomAttrs* nonnull %attra)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%ud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Validation failed.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%struct.Payload = type { <3 x float> }
+%struct.CustomAttrs = type { float, float }
+%dx.types.ResourceProperties = type { i32, i32 }
+%dx.types.HitObject = type { i8* }
+%struct.RaytracingAccelerationStructure = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %ldh = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4
+  %attra = alloca %struct.CustomAttrs, align 4
+  %rq = call i32 @dx.op.allocateRayQuery(i32 178, i32 5)  ; AllocateRayQuery(constRayFlags)
+  %createh = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %ldh)  ; CreateHandleForLib(Resource)
+  %annoth = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %createh, %dx.types.ResourceProperties { i32 16, i32 0 })  ; AnnotateHandle(res,props)  resource: RTAccelerationStructure
+  call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %rq, %dx.types.Handle %annoth, i32 0, i32 255, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)  ; RayQuery_TraceRayInline(rayQueryHandle,accelerationStructure,rayFlags,instanceInclusionMask,origin_X,origin_Y,origin_Z,tMin,direction_X,direction_Y,direction_Z,tMax)
+  ; FromRayQuery: %ok is the valid baseline; %ud1 passes an undef ray query handle.
+  %ok = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %rq)  ; HitObject_FromRayQuery(rayQueryHandle)
+  %ud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 undef)  ; HitObject_FromRayQuery(rayQueryHandle)
+  ; FromRayQueryWithAttrs: %attrsok is the valid baseline; %attrsud1..3 each make one argument undef.
+  %attrsok = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull %attra)  ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs)
+  %attrsud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 undef, i32 16, %struct.CustomAttrs* nonnull %attra)  ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs)
+  %attrsud2 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 undef, %struct.CustomAttrs* nonnull %attra)  ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs)
+  %attrsud3 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull undef)  ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs)
+
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @dx.op.allocateRayQuery(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.rayQuery_TraceRayInline(i32, i32, %dx.types.Handle, i32, i32, float, float, float, float, float, float, float, float) #0
+
+; Function Attrs: nounwind readonly
+declare %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32, i32, i32, %struct.CustomAttrs*) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!6}
+!dx.dxrPayloadAnnotations = !{!10}
+!dx.entryPoints = !{!13, !15}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{!3, null, null, null}
+!3 = !{!4}
+!4 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !5}
+!5 = !{i32 0, i32 4}
+!6 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7}
+!7 = !{!8}
+!8 = !{i32 1, !9, !9}
+!9 = !{}
+!10 = !{i32 0, %struct.Payload undef, !11}
+!11 = !{!12}
+!12 = !{i32 0, i32 8210}
+!13 = !{null, !"", null, !2, !14}
+!14 = !{i32 0, i64 33554432}
+!15 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !16}
+!16 = !{i32 8, i32 7, i32 5, !17}
+!17 = !{i32 0}

From 9536291dccd13728f9d54fbc8adbb3d8ba73f0dd Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 2 May 2025 19:38:54 +0200
Subject: [PATCH 24/93] [SER] Validate HitObject::TraceRay|Invoke (#7384)

Validate:
 HitObject_Invoke
 HitObject_TraceRay

Rules:
 No undef params
 Resource handle must be AS

SER implementation tracker: #7214
---
 lib/DxilValidation/DxilValidation.cpp         |  15 +++
 .../ser_hitobject_invoke_failing.ll           |  58 +++++++++
 .../ser_hitobject_trace_failing.ll            | 114 ++++++++++++++++++
 .../ser_hitobject_trace_invaliduav.ll         | 108 +++++++++++++++++
 4 files changed, 295 insertions(+)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll
 create mode 100644 tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll

diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 11dfb42a6c..bd69cdaf5d 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -2298,6 +2298,21 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
     break;
   }
 
+  // Shader Execution Reordering - Invoke / TraceRay: reject undef arguments.
+  case DXIL::OpCode::HitObject_Invoke: {
+    if (isa<UndefValue>(CI->getArgOperand(1)))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject);
+    if (isa<UndefValue>(CI->getArgOperand(2)))
+      ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+  } break;
+  case DXIL::OpCode::HitObject_TraceRay: {
+    Value *Hdl = CI->getArgOperand(
+        DxilInst_HitObject_TraceRay::arg_accelerationStructure);
+    ValidateASHandle(CI, Hdl, ValCtx);
+    for (unsigned ArgIdx = 2; ArgIdx < CI->getNumArgOperands(); ++ArgIdx)
+      if (isa<UndefValue>(CI->getArgOperand(ArgIdx)))
+        ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized);
+  } break;
   case DXIL::OpCode::AtomicBinOp:
   case DXIL::OpCode::AtomicCompareExchange: {
     Type *pOverloadType = OP::GetOverloadType(Opcode, CI->getCalledFunction());
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll
new file mode 100644
index 0000000000..a6bdd49f72
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll
@@ -0,0 +1,58 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.Payload = type { <3 x float> }
+%dx.types.HitObject = type { i8* }
+
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at 'call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: HitObject is undef.
+; CHECK-NEXT: note: at 'call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject undef, %struct.Payload* nonnull %pld)' in block '#0' of function '?main@@YAXXZ'.
+
+; CHECK-NEXT: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %pld = alloca %struct.Payload, align 4
+  %nop = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266)  ; HitObject_MakeNop()
+  call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull %pld)  ; HitObject_Invoke(hitObject,payload) - valid baseline, no diagnostic listed for it above
+  call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject undef, %struct.Payload* nonnull %pld)  ; HitObject_Invoke(hitObject,payload) - undef hitObject
+  call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull undef)  ; HitObject_Invoke(hitObject,payload) - undef payload
+
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare %dx.types.HitObject @dx.op.hitObject_MakeNop(i32) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.hitObject_Invoke.struct.Payload(i32, %dx.types.HitObject, %struct.Payload*) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.typeAnnotations = !{!2}
+!dx.dxrPayloadAnnotations = !{!3}
+!dx.entryPoints = !{!4, !6}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7}
+!3 = !{i32 0, %struct.Payload undef, !8}
+!4 = !{null, !"", null, null, !5}
+!5 = !{i32 0, i64 0}
+!6 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !9}
+!7 = !{!10}
+!8 = !{!11}
+!9 = !{i32 8, i32 7, i32 5, !12}
+!10 = !{i32 1, !13, !13}
+!11 = !{i32 0, i32 8210}
+!12 = !{i32 0}
+!13 = !{}
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll
new file mode 100644
index 0000000000..eb0d2576b0
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll
@@ -0,0 +1,114 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%struct.Payload = type { <3 x float> }
+%dx.types.ResourceProperties = type { i32, i32 }
+%dx.types.HitObject = type { i8* }
+%struct.RaytracingAccelerationStructure = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4
+
+; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud16 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* undef)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud15 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float undef, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud14 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float undef, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud13 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float undef, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud12 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud11 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float undef, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud10 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float undef, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud9 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float undef, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud8 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud7 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 undef, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud6 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 undef, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud5 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 undef, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud4 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 undef, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud3 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 undef, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value.
+; CHECK-NEXT: note: at '%tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: TraceRay should only use RTAccelerationStructure.
+; CHECK-NEXT: note: at '%tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'.
+
+; CHECK-NEXT: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4
+  %2 = alloca %struct.Payload, align 4
+  %3 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 16, i32 0 })  ; AnnotateHandle(res,props)  resource: RTAccelerationStructure
+  ; %tok is the valid baseline; each %tudN makes exactly one operand undef: the N-th, counting the opcode as the first.
+  %tok = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud3 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 undef, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud4 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 undef, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud5 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 undef, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud6 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 undef, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud7 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 undef, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud8 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud9 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float undef, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud10 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float undef, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud11 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float undef, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud12 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud13 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float undef, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud14 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float undef, float 7.000000e+00, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud15 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float undef, %struct.Payload* nonnull %2)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+  %tud16 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* undef)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.Payload*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!3}
+!dx.dxrPayloadAnnotations = !{!4}
+!dx.entryPoints = !{!5, !6}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{!7, null, null, null}
+!3 = !{i32 1, void ()* @"\01?main@@YAXXZ", !8}
+!4 = !{i32 0, %struct.Payload undef, !9}
+!5 = !{null, !"", null, !2, null}
+!6 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !10}
+!7 = !{!11}
+!8 = !{!12}
+!9 = !{!13}
+!10 = !{i32 8, i32 7, i32 5, !14}
+!11 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !15}
+!12 = !{i32 1, !16, !16}
+!13 = !{i32 0, i32 8210}
+!14 = !{i32 0}
+!15 = !{i32 0, i32 4}
+!16 = !{}
diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll
new file mode 100644
index 0000000000..c4f3a918f8
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll
@@ -0,0 +1,108 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; shader hash: b22988e7874179601860019e56fb877e
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; RTAS                              texture     i32         ras      T0t4294967295,space4294967295     1
+; nonas_buf                             UAV    byte         r/w      U0u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%struct.Payload = type { <3 x float> }
+%dx.types.ResourceProperties = type { i32, i32 }
+%dx.types.HitObject = type { i8* }
+%struct.RaytracingAccelerationStructure = type { i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4
+@"\01?nonas_buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+
+; CHECK: Function: ?main@@YAXXZ: error: TraceRay should only use RTAccelerationStructure.
+; CHECK-NEXT: note: at '%invalid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %7, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4
+  %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?nonas_buf@@3URWByteAddressBuffer@@A", align 4
+  %3 = alloca %struct.Payload, align 4
+  %4 = bitcast %struct.Payload* %3 to i8*
+  call void @llvm.lifetime.start(i64 12, i8* %4) #0
+  %5 = getelementptr inbounds %struct.Payload, %struct.Payload* %3, i32 0, i32 0
+  store <3 x float> <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>, <3 x float>* %5, align 4, !tbaa !20
+  %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2)  ; CreateHandleForLib(Resource)
+  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %7, i32 0, i32 undef, float 1.100000e+01, float undef, float undef, float undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
+  %8 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %9 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %8, %dx.types.ResourceProperties { i32 16, i32 0 })  ; AnnotateHandle(res,props)  resource: RTAccelerationStructure
+
+  %valid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %9, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+
+  %invalid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %7, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3)  ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload)
+
+  call void @llvm.lifetime.end(i64 12, i8* %4) #0
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.rawBufferStore.f32(i32, %dx.types.Handle, i32, i32, float, float, float, float, i8, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.Payload*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!8}
+!dx.dxrPayloadAnnotations = !{!12}
+!dx.entryPoints = !{!15, !17}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{!3, !6, null, null}
+!3 = !{!4}
+!4 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !5}
+!5 = !{i32 0, i32 4}
+!6 = !{!7}
+!7 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?nonas_buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"nonas_buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!8 = !{i32 1, void ()* @"\01?main@@YAXXZ", !9}
+!9 = !{!10}
+!10 = !{i32 1, !11, !11}
+!11 = !{}
+!12 = !{i32 0, %struct.Payload undef, !13}
+!13 = !{!14}
+!14 = !{i32 0, i32 8210}
+!15 = !{null, !"", null, !2, !16}
+!16 = !{i32 0, i64 8589934608}
+!17 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !18}
+!18 = !{i32 8, i32 7, i32 5, !19}
+!19 = !{i32 0}
+!20 = !{!21, !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C/C++ TBAA"}

From 1198c30f05ed944873ca55e89970fae407e2aacc Mon Sep 17 00:00:00 2001
From: Russell Liu <ginshio78@gmail.com>
Date: Sat, 3 May 2025 15:00:29 +0800
Subject: [PATCH 25/93] [SPIRV] Fix constant value in function (#7415)

We get a crash when using a SPIR-V intrinsic to create a constant value.
```fundamental
fatal error: generated SPIR-V is invalid: Constant cannot appear in a function declaration
  %spirvIntrinsicType_42 = OpConstant %spirvIntrinsicType 42
```
---
 tools/clang/lib/SPIRV/EmitVisitor.cpp               |  8 +++++++-
 .../CodeGenSPIRV/spv.intrinsicConstantValue.hlsl    | 13 +++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl

diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp
index 2a3ffd82f4..f58160254a 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.cpp
+++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp
@@ -2000,7 +2000,13 @@ bool EmitVisitor::visit(SpirvIntrinsicInstruction *inst) {
     }
   }
 
-  finalizeInstruction(&mainBinary);
+  auto opcode = static_cast<spv::Op>(inst->getInstruction());
+  if ((opcode == spv::Op::OpSpecConstant || opcode == spv::Op::OpConstant) &&
+      !inst->getInstructionSet()) {
+    finalizeInstruction(&typeConstantBinary);
+  } else {
+    finalizeInstruction(&mainBinary);
+  }
   return true;
 }
 
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl b/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl
new file mode 100644
index 0000000000..a592863f1b
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl
@@ -0,0 +1,13 @@
+// RUN: %dxc -Od -T cs_6_8 -spirv -fcgl %s | FileCheck %s
+
+// CHECK: %spirvIntrinsicType = OpTypeInt 8 0
+using uint8_t [[vk::ext_capability(/* Int8 */ 39)]] =
+    vk::SpirvType</* OpTypeInt */ 21, 8, 8, vk::Literal<vk::integral_constant<uint, 8> >,
+                  vk::Literal<vk::integral_constant<bool, false> > >;
+
+[[vk::ext_instruction(/* OpConstant */ 43)]] uint8_t mkconsant([[vk::ext_literal]] int v);
+
+// CHECK: OpConstant %spirvIntrinsicType 42
+static const uint8_t K = mkconsant(42);
+
+[numthreads(1, 1, 1)] void main() {}

From 6f17379e95b70943d8be4e52079509ac7c0cec02 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Tue, 6 May 2025 17:59:21 +0200
Subject: [PATCH 26/93] [SER] REORDER_SCOPE check-fail validation tests (#7428)

Show that the 'REORDER_SCOPE' semantic flag is rejected before DXIL 1.9. The
REORDER_SCOPE validation logic was already implemented in #7263.

Move REORDER_SCOPE validation tests to `LitDXILValidation/` and adapt
RUN lines as we are touching this (#7359).

SER implementation tracker: #7214
---
 .../ser_reorder_scope_sm68_failing.ll         | 77 +++++++++++++++++++
 .../ser_reorder_scope_sm69_passing.ll         |  5 +-
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll
 rename tools/clang/test/{HLSLFileCheck/validation => LitDXILValidation}/ser_reorder_scope_sm69_passing.ll (96%)

diff --git a/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll
new file mode 100644
index 0000000000..cd93eca793
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll
@@ -0,0 +1,77 @@
+; REQUIRES: dxil-1-8
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; BAB                                   UAV    byte         r/w      U0             u1     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+@"\01?BAB@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+
+; CHECK: Function: ?main@@YAXXZ: error: Invalid semantic flags on DXIL operation 'BarrierByMemoryType'
+; CHECK-NEXT: note: at 'call void @dx.op.barrierByMemoryType(i32 244, i32 1, i32 8)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Invalid semantic flags on DXIL operation 'barrierByMemoryHandle'
+; CHECK-NEXT: note: at 'call void @dx.op.barrierByMemoryHandle(i32 245, %dx.types.Handle %3, i32 8)' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: Function uses features incompatible with the shader model.
+; CHECK-NEXT: Validation failed.
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?BAB@@3URWByteAddressBuffer@@A", align 4
+  call void @dx.op.barrierByMemoryType(i32 244, i32 1, i32 8)  ; BarrierByMemoryType(MemoryTypeFlags,SemanticFlags)
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  call void @dx.op.barrierByMemoryHandle(i32 245, %dx.types.Handle %3, i32 8)  ; BarrierByMemoryHandle(object,SemanticFlags)
+  ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @dx.op.barrierByMemoryType(i32, i32, i32) #1
+
+; Function Attrs: noduplicate nounwind
+declare void @dx.op.barrierByMemoryHandle(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #3
+
+attributes #0 = { nounwind }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!5}
+!dx.entryPoints = !{!9, !11}
+
+!0 = !{i32 1, i32 8}
+!1 = !{!"lib", i32 6, i32 8}
+!2 = !{null, !3, null, null}
+!3 = !{!4}
+!4 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?BAB@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"BAB", i32 0, i32 1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!5 = !{i32 1, void ()* @"\01?main@@YAXXZ", !6}
+!6 = !{!7}
+!7 = !{i32 1, !8, !8}
+!8 = !{}
+!9 = !{null, !"", null, !2, !10}
+!10 = !{i32 0, i64 8589934608}
+!11 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !12}
+!12 = !{i32 8, i32 7, i32 5, !13}
+!13 = !{i32 0}
diff --git a/tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll
similarity index 96%
rename from tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll
rename to tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll
index cab9942b02..fa2733ef22 100644
--- a/tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll
+++ b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll
@@ -1,4 +1,7 @@
-; RUN: %dxilver 1.9 | %dxv %s
+; REQUIRES: dxil-1-9
+; RUN: %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Validation succeeded.
 
 ; Buffer Definitions:
 ;

From 556fc5b3307a0f9571a7d567116e01f8977c8d85 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Wed, 7 May 2025 01:06:46 -0700
Subject: [PATCH 27/93] [Internal] Remove dead fallthrough statement (#7433)

This PR removes a dead fallthrough statement that follows a switch case
which is already terminated by a `break`.
It needs to be removed because the Linux build of DXC in our internal
infrastructure fails with an error when a dead fallthrough statement is
detected.
For context, here is the relevant error:

```
ScalarReplAggregatesHLSL.cpp:2822:9: error: fallthrough annotation in unreachable code [-Werror,-Wimplicit-fallthrough]
        LLVM_FALLTHROUGH;
        ^
... DXC/include/llvm/Support/Compiler.h:224:26: note: expanded from macro 'LLVM_FALLTHROUGH'
#define LLVM_FALLTHROUGH [[fallthrough]]
                         ^
1 error generated.
```
---
 lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index 20265af40a..8bd78dd9a6 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -2819,7 +2819,6 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
         DeadInsts.push_back(CI);
         break;
       }
-        LLVM_FALLTHROUGH;
       default:
         // RayQuery this pointer replacement.
         if (OldVal->getType()->isPointerTy() &&

From 8df744951f4389c2d682faae1cb98a8475c74c04 Mon Sep 17 00:00:00 2001
From: Chris B <cbieneman@microsoft.com>
Date: Wed, 7 May 2025 10:51:53 -0700
Subject: [PATCH 28/93] Pull format checking tools from `main` (#7403)

This updates the clang-format-checker action workflow to pull the format
checker and tools from `main` instead of from the PR.

Note: This PR basically can't be tested pre-merge since the pre-merge
check will use the version of the action in `main`.
---
 .github/workflows/clang-format-checker.yml | 45 ++++++++++++++++------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/clang-format-checker.yml b/.github/workflows/clang-format-checker.yml
index d1887e4519..74b734a7c0 100644
--- a/.github/workflows/clang-format-checker.yml
+++ b/.github/workflows/clang-format-checker.yml
@@ -12,7 +12,7 @@ jobs:
     permissions:
       pull-requests: write
     steps:
-      - name: Fetch LLVM sources
+      - name: Fetch DirectXShaderCompiler sources
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           ref: ${{ github.event.pull_request.head.sha }}
@@ -31,6 +31,20 @@ jobs:
           separator: ","
           skip_initial_fetch: true
 
+      # We need to pull the script from the main branch, so that we ensure
+      # we get the latest version of this script.
+      - name: Fetch code formatting utils
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: microsoft/DirectXShaderCompiler
+          ref: ${{ github.base_ref }}
+          sparse-checkout: |
+            utils/git/requirements_formatting.txt
+            utils/git/code-format-helper.py
+            utils/git/code-format-save-diff.py
+          sparse-checkout-cone-mode: false
+          path: code-format-tools
+
       - name: "Listed files"
         env:
           LISTED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
@@ -48,10 +62,10 @@ jobs:
         with:
           python-version: '3.11'
           cache: 'pip'
-          cache-dependency-path: 'utils/git/requirements_formatting.txt'
+          cache-dependency-path: 'code-format-tools/utils/git/requirements_formatting.txt'
 
       - name: Install python dependencies
-        run: pip install -r utils/git/requirements_formatting.txt
+        run: pip install -r code-format-tools/utils/git/requirements_formatting.txt
 
       - name: Run code formatter
         id: formatter
@@ -61,7 +75,7 @@ jobs:
           END_REV: ${{ github.event.pull_request.head.sha }}
           CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
         run: |
-           python utils/git/code-format-helper.py \
+           python code-format-tools/utils/git/code-format-helper.py \
             --token ${{ secrets.GITHUB_TOKEN }} \
             --issue-number $GITHUB_PR_NUMBER \
             --start-rev $START_REV \
@@ -92,28 +106,37 @@ jobs:
             } catch (err) {
               core.setFailed(`Request failed with error ${err}`)
             } 
-      - name: Fetch LLVM sources
-        uses: actions/checkout@v4
+      
+      # We need to pull the script from the main branch, so that we ensure
+      # we get the latest version of this script.
+      - name: Fetch code formatting utils
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          fetch-depth: 2
-          path: build/main_src
+          repository: microsoft/DirectXShaderCompiler
+          ref: ${{ github.base_ref }}
+          sparse-checkout: |
+            utils/git/requirements_formatting.txt
+            utils/git/code-format-helper.py
+            utils/git/code-format-save-diff.py
+          sparse-checkout-cone-mode: false
+          path: code-format-tools
 
       - name: Setup Python env
         uses: actions/setup-python@v4
         with:
           python-version: '3.11'
           cache: 'pip'
-          cache-dependency-path: 'build/main_src/utils/git/requirements_formatting.txt'
+          cache-dependency-path: 'code-format-tools/utils/git/requirements_formatting.txt'
 
       - name: Install python dependencies
-        run: pip install -r build/main_src/utils/git/requirements_formatting.txt
+        run: pip install -r code-format-tools/utils/git/requirements_formatting.txt
 
       - name: Apply code diff
         env:
           GITHUB_PR_NUMBER: ${{ github.event.issue.number }}
           COMMENT_ID: ${{ github.event.comment.id }}
         run: |
-          python build/main_src/utils/git/code-format-save-diff.py \
+          python code-format-tools/utils/git/code-format-save-diff.py \
             --token ${{ secrets.GITHUB_TOKEN }} \
             --issue-number $GITHUB_PR_NUMBER \
             --tmp-diff-file $TMP_DIFF_FILE \

From 231d648af0f9bf9ecb9a61c591337ceca67c6cd3 Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth@microsoft.com>
Date: Wed, 7 May 2025 13:34:07 -0700
Subject: [PATCH 29/93] [SM6.9] Disable native vec deriv ops and expand testing
 (#7432)

Several intrinsics that were enabled for native vectors late got their
testing removed as it expected scalarized forms. This adds tests for
pow, modf, and abs in their native vector intrinsic forms. It removes
native vector intrinsics for derivative operations as they require more
scalarization removal due to their convergent markers and restores the
scalarized testing for them. The 1024 size was removed from
longvec-intrinsics as the verbose way that constant vectors are
represented in the disassembly made the test take significantly longer.
---
 lib/DXIL/DxilOperations.cpp                   | 16 +++---
 .../hlsl/types/longvec-intrinsics.hlsl        | 50 ++++++++++++++++++-
 ...longvec-trivial-scalarized-intrinsics.hlsl |  7 +++
 ...ongvec-trivial-unary-float-intrinsics.hlsl |  2 +
 utils/hct/hctdb.py                            |  8 +--
 5 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index 95e8dfaeba..a66dfc68d4 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -765,32 +765,32 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x403}},
-     {{0x3}}}, // Overloads: hf<hf
+     {{0x3}},
+     {{0x0}}}, // Overloads: hf
     {OC::DerivCoarseY,
      "DerivCoarseY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x403}},
-     {{0x3}}}, // Overloads: hf<hf
+     {{0x3}},
+     {{0x0}}}, // Overloads: hf
     {OC::DerivFineX,
      "DerivFineX",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x403}},
-     {{0x3}}}, // Overloads: hf<hf
+     {{0x3}},
+     {{0x0}}}, // Overloads: hf
     {OC::DerivFineY,
      "DerivFineY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x403}},
-     {{0x3}}}, // Overloads: hf<hf
+     {{0x3}},
+     {{0x0}}}, // Overloads: hf
 
     // Pixel shader
     {OC::EvalSnapped,
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
index 0b7f0d6b2f..f13772970b 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
@@ -2,7 +2,6 @@
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=7   %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=125 %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=256 %s | FileCheck %s
-// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=1024 %s | FileCheck %s
 
 // Test vector-enabled non-trivial intrinsics that take parameters of various types.
 
@@ -203,6 +202,36 @@ void main() {
   // CHECK: fmul fast <[[NUM]] x float> [[tmp]], <float 0x3FE62E4300000000
   fRes += log(fVec1);
 
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 23, <[[NUM]] x half> [[hvec2]])  ; Log(value)
+  // CHECK: [[tmp2:%.*]] = fmul fast <[[NUM]] x half> [[tmp]], [[hvec1]]
+  // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 21, <[[NUM]] x half> [[tmp2]])  ; Exp(value)
+  hRes += pow(hVec2, hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 23, <[[NUM]] x float> [[fvec2]])  ; Log(value)
+  // CHECK: [[tmp2:%.*]] = fmul fast <[[NUM]] x float> [[tmp]], [[fvec1]]
+  // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 21, <[[NUM]] x float> [[tmp2]])  ; Exp(value)
+  fRes += pow(fVec2, fVec1);
+
+  vector<half, NUM> hVal;
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 29, <[[NUM]] x half> [[hvec1]])  ; Round_z(value)
+  // CHECK: fsub fast <[[NUM]] x half> [[hvec1]], [[tmp]]
+  hRes *= modf(hVec1, hVal);
+  hRes += hVal;
+
+  vector<float, NUM> fVal;
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 29, <[[NUM]] x float> [[fvec1]])  ; Round_z(value)
+  // CHECK: fsub fast <[[NUM]] x float> [[fvec1]], [[tmp]]
+  fRes *= modf(fVec1, fVal);
+  fRes += fVal;
+
   // CHECK-NOT: extractelement
   // CHECK-NOT: insertelement
   // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x half> [[hvec2]], [[hvec1]]
@@ -227,6 +256,25 @@ void main() {
   // CHECK: fmul fast <[[NUM]] x float> [[mul]], [[sub]]
   fRes += smoothstep(fVec1, fVec2, fVec3);
 
+  // Note that Fabs is tested in longvec-trivial-unary-float-intrinsics.
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i16> zeroinitializer, [[svec1]]
+  // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 37, <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[tmp]])  ; IMax(a,b)
+  sRes += abs(sVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i32> zeroinitializer, [[ivec1]]
+  // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 37, <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[tmp]])  ; IMax(a,b)
+  iRes += abs(iVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i64> zeroinitializer, [[lvec1]]
+  // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 37, <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[tmp]])  ; IMax(a,b)
+  lRes += abs(lVec1);
+
   // Intrinsics that expand into llvm ops.
 
   // CHECK-NOT: extractelement
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
index 6ebb511b00..37fb1d2e15 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
@@ -9,6 +9,13 @@
 // RUN: %dxc -DFUNC=countbits   -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=firstbitlow  -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddx         -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddx_coarse  -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddx_fine    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddy         -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddy_coarse  -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=ddy_fine    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=fwidth      -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=QuadReadLaneAt         -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
 // RUN: %dxc -DFUNC=QuadReadAcrossX        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
 // RUN: %dxc -DFUNC=QuadReadAcrossY        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
index 91ab631a7e..9cc3d23b66 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
@@ -1,3 +1,5 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=abs  -DOP=6 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=abs  -DOP=6 -DNUM=1022 %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate  -DOP=7 -DNUM=7    %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate  -DOP=7 -DNUM=1022 %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos  -DOP=12 -DNUM=7    %s | FileCheck %s
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 57f2574005..7954faf2af 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -2629,7 +2629,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per stamp",
-            "hf<",
+            "hf",
             "rn",
             [
                 db_dxil_param(
@@ -2647,7 +2647,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per stamp",
-            "hf<",
+            "hf",
             "rn",
             [
                 db_dxil_param(
@@ -2665,7 +2665,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per pixel",
-            "hf<",
+            "hf",
             "rn",
             [
                 db_dxil_param(
@@ -2683,7 +2683,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per pixel",
-            "hf<",
+            "hf",
             "rn",
             [
                 db_dxil_param(

From 422604be6845851bb955579cab21498b5b38301a Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Thu, 8 May 2025 16:32:27 -0700
Subject: [PATCH 30/93] [HLSL Options] Remove select-validator option (#7423)

This PR removes the select-validator option. It is being deprecated, and
it wasn't ever officially documented.
Fixes https://github.com/microsoft/DirectXShaderCompiler/issues/7419
---
 include/dxc/Support/HLSLOptions.td                 |  2 --
 lib/DxcSupport/HLSLOptions.cpp                     | 14 --------------
 .../test/DXC/deprecated-select-validator.hlsl      | 14 ++++++++++++++
 .../hlsl/types/struct/struct-annotations.hlsl      |  6 +++---
 .../test/HLSLFileCheck/infra/auto-dxilver.hlsl     | 11 ++++++-----
 .../WrongShaderModel.hlsl                          |  4 +---
 6 files changed, 24 insertions(+), 27 deletions(-)
 create mode 100644 tools/clang/test/DXC/deprecated-select-validator.hlsl

diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
index ea000f4877..4d72cb2312 100644
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@@ -317,8 +317,6 @@ def print_before_all : Flag<["-", "/"], "print-before-all">, Group<hlslcomp_Grou
   HelpText<"Print LLVM IR before each pass.">;
 def print_before : Separate<["-", "/"], "print-before">, Group<hlslcomp_Group>, Flags<[CoreOption, HelpHidden]>,
   HelpText<"Print LLVM IR before a specific pass. May be specificied multiple times.">;
-def select_validator : Separate<["-", "/"], "select-validator">, Group<hlslcomp_Group>, Flags<[CoreOption, HelpHidden]>,
-  HelpText<"Select validator: auto: (default) use DXIL.dll if found, otherwise use internal;  internal: internal non-signing validator;  external: use DXIL.dll if found, otherwise fail compilation.">;
 def print_after_all : Flag<["-", "/"], "print-after-all">, Group<hlslcomp_Group>, Flags<[CoreOption, HelpHidden]>,
   HelpText<"Print LLVM IR after each pass.">;
 def print_after : Separate<["-", "/"], "print-after">, Group<hlslcomp_Group>, Flags<[CoreOption, HelpHidden]>,
diff --git a/lib/DxcSupport/HLSLOptions.cpp b/lib/DxcSupport/HLSLOptions.cpp
index 1ce7d0dfc0..eb071eb0a6 100644
--- a/lib/DxcSupport/HLSLOptions.cpp
+++ b/lib/DxcSupport/HLSLOptions.cpp
@@ -1033,20 +1033,6 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
     opts.ValVerMinor = (unsigned long)minor64;
   }
 
-  llvm::StringRef valSelectStr = Args.getLastArgValue(OPT_select_validator);
-  if (!valSelectStr.empty()) {
-    opts.SelectValidator = llvm::StringSwitch<ValidatorSelection>(valSelectStr)
-                               .Case("auto", ValidatorSelection::Auto)
-                               .Case("internal", ValidatorSelection::Internal)
-                               .Case("external", ValidatorSelection::External)
-                               .Default(ValidatorSelection::Invalid);
-    if (opts.SelectValidator == ValidatorSelection::Invalid) {
-      errors << "Unsupported value '" << valSelectStr
-             << "for -select-validator option.";
-      return 1;
-    }
-  }
-
   if (opts.IsLibraryProfile() && Minor == 0xF) {
     if (opts.ValVerMajor != UINT_MAX && opts.ValVerMajor != 0) {
       errors << "Offline library profile cannot be used with non-zero "
diff --git a/tools/clang/test/DXC/deprecated-select-validator.hlsl b/tools/clang/test/DXC/deprecated-select-validator.hlsl
new file mode 100644
index 0000000000..2ad3e5199c
--- /dev/null
+++ b/tools/clang/test/DXC/deprecated-select-validator.hlsl
@@ -0,0 +1,14 @@
+// Test that the deprecated option, select-validator, doesn't work.
+// RUN: not %dxc -E main -T vs_6_7 -select-validator internal %s 2>&1 | FileCheck %s
+
+// CHECK: dxc failed : Unknown argument: '-select-validator'
+
+float4 main(int loc : SV_StartVertexLocation
+           , uint loc2 : SV_StartInstanceLocation
+           ) : SV_Position
+{
+    float4 r = 0;
+    r += loc;
+    r += loc2;
+    return r;
+}
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl
index 5a1b5e43d8..4ffb325c8b 100644
--- a/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl
@@ -1,5 +1,5 @@
-// RUN: %dxc -T ps_6_8 -E main -Qkeep_reflect_in_dxil -select-validator internal %s | FileCheck -check-prefix=CHECK68 %s
-// RUN: %dxc -T ps_6_7 -E main -Qkeep_reflect_in_dxil -select-validator internal %s | FileCheck -check-prefix=CHECK67 %s
+// RUN: %dxc -T ps_6_8 -E main -Qkeep_reflect_in_dxil %s | FileCheck -check-prefix=CHECK68 %s
+// RUN: %dxc -T ps_6_7 -E main -Qkeep_reflect_in_dxil %s | FileCheck -check-prefix=CHECK67 %s
 
 // Make sure the vector is annotated with vector size (DXIL 1.8 and higher),
 // matrix is annotated with matrix size and orientation, and scalar does not
@@ -47,4 +47,4 @@ StructuredBuffer<MyStruct> g_myStruct;
 float main() : SV_Target 
 { 
     return g_myStruct[0].vec.x + g_myStruct[0].vec.y; 
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl b/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl
index 166fa5918d..14ee7f7bf9 100644
--- a/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl
+++ b/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl
@@ -5,14 +5,17 @@
 // This should implicitly require dxilver 1.8.
 
 // RUN: %dxc -T vs_6_8 -Vd %s | FileCheck %s
-// Even though this is using -Vd, the validator version is set by the available
-// validator.  If that isn't version 1.8 or above, we'll see an error.
+// Even though this is using -Vd, the validator version being checked is the internal
+// validator's version. If a pre-DXIL-1.8 DXC was used to run this test, we expect failure,
+// since the internal validator will be the same version as the older DXC.
 // The implicit dxilver logic should not skip the check when -Vd is used.
 // CHECK-NOT: error: validator version {{.*}} does not support target profile.
 
 // RUN: %dxc -T vs_6_0 -validator-version 1.8 %s | FileCheck %s
 // Even though target is 6.0, the explicit -validator-version should add an
-// implicit dxilver 1.8 requirement.
+// implicit dxilver 1.8 requirement. That requirement is met by any DXC that
+// supports DXIL 1.8 or later, because its internal validator is then at least
+// version 1.8.
 // CHECK-NOT: error: The module cannot be validated by the version of the validator currently attached.
 
 // This error would occur if run against wrong compiler.
@@ -21,8 +24,6 @@
 // Catch any other unexpected error cases.
 // CHECK-NOT: error
 
-// RUN: %dxc -T vs_6_8 -select-validator internal %s | FileCheck %s
-// This should always be run, and always succeed.
 // CHECK: define void @main()
 
 void main() {}
diff --git a/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl b/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
index 667e1f4579..4bddf37acd 100644
--- a/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
@@ -1,7 +1,5 @@
 // TODO: use -verify instead of FileCheck after fix https://github.com/microsoft/DirectXShaderCompiler/issues/5768
-// -select-validator internal used to avoid downlevel validator testing
-// incompatibility with shader model 6.7.
-// RUN: not %dxc -E main -T vs_6_7 -select-validator internal %s 2>&1 | FileCheck %s --check-prefix=SM67
+// RUN: not %dxc -E main -T vs_6_7 %s 2>&1 | FileCheck %s --check-prefix=SM67
 
 // SM67:invalid semantic 'SV_StartVertexLocation' for vs 6.7
 // SM67:invalid semantic 'SV_StartInstanceLocation' for vs 6.7

From 474f9d28a334ac3abd449e4afa823a14719fad19 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 9 May 2025 18:42:44 +0200
Subject: [PATCH 31/93] [SER] Validate 'reordercoherent' resource property
 (#7429)

Validates:
 All resources
 All instructions using resources

Rules:
 'reordercoherent' may only be used in SM6.9+ in resource handles and
 resource declarations.
 Increment/DecrementCounter unsupported on 'reordercoherent' resources.

Create a new DXIL 1.9 variant of the 'CompileWhenOkThenCheckRDAT'
container test and restore the old one without 'reordercoherent'
(pre-#7250). The validator now rejects 'reordercoherent' in DXIL 1.3 and
accepts it from DXIL 1.9 onward.

SER implementation tracker: #7214

---------

Co-authored-by: Tex Riddell <texr@microsoft.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 docs/DXIL.rst                                 |   3 +-
 lib/DxilValidation/DxilValidation.cpp         |  24 ++-
 .../ser_reordercoherent_invalid_incdec.ll     |  92 +++++++++++
 .../ser_reordercoherent_invalid_sm.ll         |  83 ++++++++++
 .../unittests/HLSL/DxilContainerTest.cpp      | 143 +++++++++++++++++-
 tools/clang/unittests/HLSL/ValidationTest.cpp |   2 +-
 utils/hct/hctdb.py                            |  11 +-
 7 files changed, 345 insertions(+), 13 deletions(-)
 create mode 100644 tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll
 create mode 100644 tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll

diff --git a/docs/DXIL.rst b/docs/DXIL.rst
index 69bcae8c53..7532ec3c42 100644
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@@ -3175,6 +3175,7 @@ INSTR.OPCONSTRANGE                                    Constant values must be in
 INSTR.OPERANDRANGE                                    DXIL intrinsic operand must be within defined range
 INSTR.PARAMMULTIPLE                                   Parameter must be a valid multiple
 INSTR.PTRBITCAST                                      Pointer type bitcast must be have same size.
+INSTR.REORDERCOHERENTREQUIRESSM69                     reordercoherent requires SM 6.9 or later.
 INSTR.RESOURCECLASSFORLOAD                            load can only run on UAV/SRV resource.
 INSTR.RESOURCECLASSFORSAMPLERGATHER                   sample, lod and gather should be on srv resource.
 INSTR.RESOURCECLASSFORUAVSTORE                        store should be on uav resource.
@@ -3216,6 +3217,7 @@ META.BARYCENTRICSTWOPERSPECTIVES                      There can only be up to tw
 META.BRANCHFLATTEN                                    Can't use branch and flatten attributes together.
 META.CLIPCULLMAXCOMPONENTS                            Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
 META.CLIPCULLMAXROWS                                  Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
+META.COHERENCENOTONAPPENDCONSUME                      globally/reorder coherent incompatible with append/consume/counter buffers
 META.COMPUTEWITHNODE                                  Compute entry must not have node metadata
 META.CONTROLFLOWHINTNOTONCONTROLFLOW                  Control flow hint only works on control flow inst.
 META.DENSERESIDS                                      Resource identifiers must be zero-based and dense.
@@ -3223,7 +3225,6 @@ META.DUPLICATESYSVALUE                                System value may only appe
 META.ENTRYFUNCTION                                    entrypoint not found.
 META.FLAGSUSAGE                                       Flags must match usage.
 META.FORCECASEONSWITCH                                Attribute forcecase only works for switch.
-META.GLCNOTONAPPENDCONSUME                            globallycoherent cannot be used with append/consume buffers: '%0'.
 META.INTEGERINTERPMODE                                Interpolation mode on integer must be Constant
 META.INTERPMODEINONEROW                               Interpolation mode must be identical for all elements packed into the same row.
 META.INTERPMODEVALID                                  Interpolation mode must be valid
diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index bd69cdaf5d..28917e0600 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -165,7 +165,8 @@ ValidateSignatureAccess(Instruction *I, DxilSignature &Sig, Value *SigId,
 
 static DxilResourceProperties GetResourceFromHandle(Value *Handle,
                                                     ValidationContext &ValCtx) {
-  if (!isa<CallInst>(Handle)) {
+  CallInst *HandleCall = dyn_cast<CallInst>(Handle);
+  if (!HandleCall) {
     if (Instruction *I = dyn_cast<Instruction>(Handle))
       ValCtx.EmitInstrError(I, ValidationRule::InstrHandleNotFromCreateHandle);
     else
@@ -175,10 +176,13 @@ static DxilResourceProperties GetResourceFromHandle(Value *Handle,
   }
 
   DxilResourceProperties RP = ValCtx.GetResourceFromVal(Handle);
-  if (RP.getResourceClass() == DXIL::ResourceClass::Invalid) {
+  if (RP.getResourceClass() == DXIL::ResourceClass::Invalid)
     ValCtx.EmitInstrError(cast<CallInst>(Handle),
                           ValidationRule::InstrHandleNotFromCreateHandle);
-  }
+  if (RP.Basic.IsReorderCoherent &&
+      !ValCtx.DxilMod.GetShaderModel()->IsSM69Plus())
+    ValCtx.EmitInstrError(HandleCall,
+                          ValidationRule::InstrReorderCoherentRequiresSM69);
 
   return RP;
 }
@@ -4182,6 +4186,9 @@ static void ValidateResourceOverlap(
 
 static void ValidateResource(hlsl::DxilResource &Res,
                              ValidationContext &ValCtx) {
+  if (Res.IsReorderCoherent() && !ValCtx.DxilMod.GetShaderModel()->IsSM69Plus())
+    ValCtx.EmitResourceError(&Res,
+                             ValidationRule::InstrReorderCoherentRequiresSM69);
   switch (Res.GetKind()) {
   case DXIL::ResourceKind::RawBuffer:
   case DXIL::ResourceKind::TypedBuffer:
@@ -4413,10 +4420,13 @@ static void ValidateResources(ValidationContext &ValCtx) {
       ValCtx.EmitResourceError(Uav.get(),
                                ValidationRule::SmCounterOnlyOnStructBuf);
     }
-    if (Uav->HasCounter() && Uav->IsGloballyCoherent())
-      ValCtx.EmitResourceFormatError(Uav.get(),
-                                     ValidationRule::MetaGlcNotOnAppendConsume,
-                                     {ValCtx.GetResourceName(Uav.get())});
+    const bool UavIsCoherent =
+        Uav->IsGloballyCoherent() || Uav->IsReorderCoherent();
+    if (Uav->HasCounter() && UavIsCoherent) {
+      StringRef Prefix = Uav->IsGloballyCoherent() ? "globally" : "reorder";
+      ValCtx.EmitResourceFormatError(
+          Uav.get(), ValidationRule::MetaCoherenceNotOnAppendConsume, {Prefix});
+    }
 
     ValidateResource(*Uav, ValCtx);
     ValidateResourceOverlap(*Uav, UavAllocator, ValCtx);
diff --git a/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll
new file mode 100644
index 0000000000..1f68a9a95f
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll
@@ -0,0 +1,92 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; COM: Original HLSL source:
+; COM: reordercoherent RWStructuredBuffer<float> buffer;
+; COM:
+; COM:
+; COM: [Shader("raygeneration")]
+; COM: void
+; COM: main()
+; COM: {
+; COM:   buffer.IncrementCounter();
+; COM:   buffer.DecrementCounter();
+; COM: }
+
+; CHECK: error: reordercoherent cannot be used on buffer with counter 'buffer'
+; CHECK-NEXT: Validation failed.
+
+; shader hash: 638950814a9023bf537d61dbb330a4c8
+;
+; Buffer Definitions:
+;
+; Resource bind info for buffer
+; {
+;
+;   float $Element;                                   ; Offset:    0 Size:     4
+;
+; }
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; buffer                                UAV  struct     r/w+cnt      U0u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.RWStructuredBuffer<float>" = type { float }
+
+@"\01?buffer@@3V?$RWStructuredBuffer@M@@A" = external constant %dx.types.Handle, align 4
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buffer@@3V?$RWStructuredBuffer@M@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 102412, i32 4 })  ; AnnotateHandle(res,props)  resource: reordercoherent RWStructuredBuffer<stride=4, counter>
+  %4 = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle %3, i8 1)  ; BufferUpdateCounter(uav,inc)
+  %5 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %6 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 102412, i32 4 })  ; AnnotateHandle(res,props)  resource: reordercoherent RWStructuredBuffer<stride=4, counter>
+  %7 = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle %6, i8 -1)  ; BufferUpdateCounter(uav,inc)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @dx.op.bufferUpdateCounter(i32, %dx.types.Handle, i8) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!10, !12}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"lib", i32 6, i32 9}
+!2 = !{null, !3, null, null}
+!3 = !{!4}
+!4 = !{i32 0, %"class.RWStructuredBuffer<float>"* bitcast (%dx.types.Handle* @"\01?buffer@@3V?$RWStructuredBuffer@M@@A" to %"class.RWStructuredBuffer<float>"*), !"buffer", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 true, i1 false, !5}
+!5 = !{i32 1, i32 4, i32 4, i1 true}
+!6 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7}
+!7 = !{!8}
+!8 = !{i32 1, !9, !9}
+!9 = !{}
+!10 = !{null, !"", null, !2, !11}
+!11 = !{i32 0, i64 8589934608}
+!12 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !13}
+!13 = !{i32 8, i32 7, i32 5, !14}
+!14 = !{i32 0}
\ No newline at end of file
diff --git a/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll
new file mode 100644
index 0000000000..efcb7d3c2b
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll
@@ -0,0 +1,83 @@
+; REQUIRES: dxil-1-8
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+
+; CHECK: error: reordercoherent requires SM 6.9 or later. 'buf'
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: reordercoherent requires SM 6.9 or later.
+; CHECK-NEXT: note: at '%3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 })' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Function: ?main@@YAXXZ: error: reordercoherent requires SM 6.9 or later.
+; CHECK-NEXT: note: at '%3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 })' in block '#0' of function '?main@@YAXXZ'.
+; CHECK-NEXT: Validation failed.
+; COM: Original HLSL source:
+; COM: reordercoherent RWByteAddressBuffer buf;
+; COM:
+; COM: [Shader("raygeneration")]
+; COM: void main()
+; COM: {
+; COM:   buf.Store(0, 11.f);
+; COM: }
+
+; shader hash: f7be6354830d1423764991adcfc26b0b
+;
+; Buffer Definitions:
+;
+;
+; Resource Bindings:
+;
+; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
+; ------------------------------ ---------- ------- ----------- ------- -------------- ------
+; buf                                   UAV    byte         r/w      U0u4294967295,space4294967295     1
+;
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+@"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 })  ; AnnotateHandle(res,props)  resource: reordercoherent RWByteAddressBuffer
+  call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %3, i32 0, i32 undef, float 1.100000e+01, float undef, float undef, float undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @dx.op.rawBufferStore.f32(i32, %dx.types.Handle, i32, i32, float, float, float, float, i8, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.typeAnnotations = !{!3}
+!dx.entryPoints = !{!4, !5}
+
+!0 = !{i32 1, i32 8}
+!1 = !{!"lib", i32 6, i32 8}
+!2 = !{null, !6, null, null}
+!3 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7}
+!4 = !{null, !"", null, !2, !8}
+!5 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !9}
+!6 = !{!10}
+!7 = !{!11}
+!8 = !{i32 0, i64 8589934608}
+!9 = !{i32 8, i32 7, i32 5, !12}
+!10 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, !13}
+!11 = !{i32 1, !14, !14}
+!12 = !{i32 0}
+!13 = !{i32 4, i1 true}
+!14 = !{}
diff --git a/tools/clang/unittests/HLSL/DxilContainerTest.cpp b/tools/clang/unittests/HLSL/DxilContainerTest.cpp
index 339b33c655..34b4d338fe 100644
--- a/tools/clang/unittests/HLSL/DxilContainerTest.cpp
+++ b/tools/clang/unittests/HLSL/DxilContainerTest.cpp
@@ -103,6 +103,7 @@ class DxilContainerTest : public ::testing::Test {
   TEST_METHOD(CompileCSWaveSizeRange_CheckPSV0)
   TEST_METHOD(CompileWhenOkThenCheckRDAT)
   TEST_METHOD(CompileWhenOkThenCheckRDAT2)
+  TEST_METHOD(CompileWhenOkThenCheckRDATSM69)
   TEST_METHOD(CompileWhenOkThenCheckReflection1)
   TEST_METHOD(DxcUtils_CreateReflection)
   TEST_METHOD(CheckReflectionQueryInterface)
@@ -1444,6 +1445,146 @@ TEST_F(DxilContainerTest, CompileCSWaveSizeRange_CheckPSV0) {
 TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDAT) {
   if (m_ver.SkipDxilVersion(1, 3))
     return;
+  const char *shader =
+      "float c_buf;"
+      "RWTexture1D<int4> tex : register(u5);"
+      "Texture1D<float4> tex2 : register(t0);"
+      "RWByteAddressBuffer b_buf;"
+      "struct Foo { float2 f2; int2 i2; };"
+      "AppendStructuredBuffer<Foo> append_buf;"
+      "ConsumeStructuredBuffer<Foo> consume_buf;"
+      "RasterizerOrderedByteAddressBuffer rov_buf;"
+      "globallycoherent RWByteAddressBuffer gc_buf;"
+      "float function_import(float x);"
+      "export float function0(min16float x) { "
+      "  return x + 1 + tex[0].x; }"
+      "export float function1(float x, min12int i) {"
+      "  return x + c_buf + b_buf.Load(x) + tex2[i].x; }"
+      "export float function2(float x) { return x + function_import(x); }"
+      "export void function3(int i) {"
+      "  Foo f = consume_buf.Consume();"
+      "  f.f2 += 0.5; append_buf.Append(f);"
+      "  rov_buf.Store(i, f.i2.x);"
+      "  gc_buf.Store(i, f.i2.y);"
+      "  b_buf.Store(i, f.i2.x + f.i2.y); }";
+  CComPtr<IDxcCompiler> pCompiler;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcBlob> pProgram;
+  CComPtr<IDxcBlobEncoding> pDisassembly;
+  CComPtr<IDxcOperationResult> pResult;
+
+  struct CheckResFlagInfo {
+    std::string name;
+    hlsl::DXIL::ResourceKind kind;
+    hlsl::RDAT::DxilResourceFlag flag;
+  };
+  const unsigned numResFlagCheck = 5; // resources used by function3
+  CheckResFlagInfo resFlags[numResFlagCheck] = {
+      {"b_buf", hlsl::DXIL::ResourceKind::RawBuffer,
+       hlsl::RDAT::DxilResourceFlag::None},
+      {"append_buf", hlsl::DXIL::ResourceKind::StructuredBuffer,
+       hlsl::RDAT::DxilResourceFlag::UAVCounter},
+      {"consume_buf", hlsl::DXIL::ResourceKind::StructuredBuffer,
+       hlsl::RDAT::DxilResourceFlag::UAVCounter},
+      {"gc_buf", hlsl::DXIL::ResourceKind::RawBuffer,
+       hlsl::RDAT::DxilResourceFlag::UAVGloballyCoherent},
+      {"rov_buf", hlsl::DXIL::ResourceKind::RawBuffer,
+       hlsl::RDAT::DxilResourceFlag::UAVRasterizerOrderedView}};
+
+  VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
+  CreateBlobFromText(shader, &pSource);
+  VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"hlsl.hlsl", L"main",
+                                      L"lib_6_3", nullptr, 0, nullptr, 0,
+                                      nullptr, &pResult));
+  HRESULT hrStatus;
+  VERIFY_SUCCEEDED(pResult->GetStatus(&hrStatus));
+  VERIFY_SUCCEEDED(hrStatus);
+  VERIFY_SUCCEEDED(pResult->GetResult(&pProgram));
+  CComPtr<IDxcContainerReflection> containerReflection;
+  uint32_t partCount;
+  IFT(m_dllSupport.CreateInstance(CLSID_DxcContainerReflection,
+                                  &containerReflection));
+  IFT(containerReflection->Load(pProgram));
+  IFT(containerReflection->GetPartCount(&partCount));
+  bool blobFound = false;
+  for (uint32_t i = 0; i < partCount; ++i) {
+    uint32_t kind;
+    IFT(containerReflection->GetPartKind(i, &kind));
+    if (kind == (uint32_t)hlsl::DxilFourCC::DFCC_RuntimeData) {
+      blobFound = true;
+      using namespace hlsl::RDAT;
+      CComPtr<IDxcBlob> pBlob;
+      IFT(containerReflection->GetPartContent(i, &pBlob));
+      // Validate using DxilRuntimeData
+      DxilRuntimeData context;
+      context.InitFromRDAT((char *)pBlob->GetBufferPointer(),
+                           pBlob->GetBufferSize());
+      auto funcTable = context.GetFunctionTable();
+      auto resTable = context.GetResourceTable();
+      VERIFY_ARE_EQUAL(funcTable.Count(), 4U); // function0..3
+      std::string str("function");
+      for (uint32_t j = 0; j < funcTable.Count(); ++j) {
+        auto funcReader = funcTable[j];
+        std::string funcName(funcReader.getUnmangledName());
+        VERIFY_IS_TRUE(str.compare(funcName.substr(0, 8)) == 0);
+        std::string cur_str = str;
+        cur_str.push_back('0' + j);
+        if (cur_str.compare("function0") == 0) {
+          VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 1U);
+          hlsl::ShaderFlags flag;
+          flag.SetUAVLoadAdditionalFormats(true);
+          flag.SetLowPrecisionPresent(true);
+          uint64_t rawFlag = flag.GetFeatureInfo();
+          VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags(), rawFlag);
+          auto resReader = funcReader.getResources()[0];
+          VERIFY_ARE_EQUAL(resReader.getClass(),
+                           hlsl::DXIL::ResourceClass::UAV);
+          VERIFY_ARE_EQUAL(resReader.getKind(),
+                           hlsl::DXIL::ResourceKind::Texture1D);
+        } else if (cur_str.compare("function1") == 0) {
+          hlsl::ShaderFlags flag;
+          flag.SetLowPrecisionPresent(true);
+          uint64_t rawFlag = flag.GetFeatureInfo();
+          VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags(), rawFlag);
+          VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 3U);
+        } else if (cur_str.compare("function2") == 0) {
+          VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags() & 0xffffffffffffffff,
+                           0U);
+          VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 0U);
+          std::string dependency = funcReader.getFunctionDependencies()[0];
+          VERIFY_IS_TRUE(dependency.find("function_import") !=
+                         std::string::npos);
+        } else if (cur_str.compare("function3") == 0) {
+          VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags() & 0xffffffffffffffff,
+                           0U);
+          VERIFY_ARE_EQUAL(funcReader.getResources().Count(), numResFlagCheck);
+          for (unsigned r = 0; r < funcReader.getResources().Count(); ++r) {
+            auto resReader = funcReader.getResources()[r]; // visit each one
+            VERIFY_ARE_EQUAL(resReader.getClass(),
+                             hlsl::DXIL::ResourceClass::UAV);
+            unsigned k = 0; // index of the matching resFlags entry
+            for (; k < numResFlagCheck; ++k) {
+              if (resFlags[k].name.compare(resReader.getName()) == 0)
+                break;
+            }
+            VERIFY_IS_LESS_THAN(k, numResFlagCheck);
+            VERIFY_ARE_EQUAL(resReader.getKind(), resFlags[k].kind);
+            VERIFY_ARE_EQUAL(resReader.getFlags(),
+                             static_cast<uint32_t>(resFlags[k].flag));
+          }
+        } else {
+          IFTBOOLMSG(false, E_FAIL, "unknown function name");
+        }
+      }
+      VERIFY_ARE_EQUAL(resTable.Count(), 8U);
+    }
+  }
+  IFTBOOLMSG(blobFound, E_FAIL, "failed to find RDAT blob after compiling");
+}
+
+TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDATSM69) {
+  if (m_ver.SkipDxilVersion(1, 9))
+    return;
   const char *shader =
       "float c_buf;"
       "RWTexture1D<int4> tex : register(u5);"
@@ -1497,7 +1638,7 @@ TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDAT) {
   VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
   CreateBlobFromText(shader, &pSource);
   VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"hlsl.hlsl", L"main",
-                                      L"lib_6_3", nullptr, 0, nullptr, 0,
+                                      L"lib_6_9", nullptr, 0, nullptr, 0,
                                       nullptr, &pResult));
   HRESULT hrStatus;
   VERIFY_SUCCEEDED(pResult->GetStatus(&hrStatus));
diff --git a/tools/clang/unittests/HLSL/ValidationTest.cpp b/tools/clang/unittests/HLSL/ValidationTest.cpp
index 01f24e0227..980bf6c7c2 100644
--- a/tools/clang/unittests/HLSL/ValidationTest.cpp
+++ b/tools/clang/unittests/HLSL/ValidationTest.cpp
@@ -1488,7 +1488,7 @@ TEST_F(ValidationTest, StructBufGlobalCoherentAndCounter) {
       L"..\\DXILValidation\\struct_buf1.hlsl", "ps_6_0",
       "!\"buf2\", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false",
       "!\"buf2\", i32 0, i32 0, i32 1, i32 12, i1 true, i1 true",
-      "globallycoherent cannot be used with append/consume buffers: 'buf2'");
+      "globallycoherent cannot be used on buffer with counter 'buf2'");
 }
 
 TEST_F(ValidationTest, StructBufStrideAlign) {
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 7954faf2af..65f9aa1d80 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -8015,9 +8015,10 @@ def build_valrules(self):
             "Hull Shader MaxTessFactor must be [%0..%1].  %2 specified.",
         )
         self.add_valrule("Meta.ValidSamplerMode", "Invalid sampler mode on sampler .")
-        self.add_valrule(
-            "Meta.GlcNotOnAppendConsume",
-            "globallycoherent cannot be used with append/consume buffers: '%0'.",
+        self.add_valrule_msg(
+            "Meta.CoherenceNotOnAppendConsume",
+            "globally/reorder coherent incompatible with append/consume/counter buffers",
+            "%0coherent cannot be used on buffer with counter",
         )
         self.add_valrule_msg(
             "Meta.StructBufAlignment",
@@ -8409,6 +8410,10 @@ def build_valrules(self):
             "Instr.MayReorderThreadUndefCoherenceHintParam",
             "Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.",
         )
+        self.add_valrule(
+            "Instr.ReorderCoherentRequiresSM69",
+            "reordercoherent requires SM 6.9 or later.",
+        )
 
         # Linalg ops
         self.add_valrule_msg(

From 377c4ca6d82adb83bf2eaf978a7040443848d6fd Mon Sep 17 00:00:00 2001
From: Jim Blandy <jimb@red-bean.com>
Date: Mon, 12 May 2025 10:35:22 -0700
Subject: [PATCH 32/93] Fix the return types of `dot4add_i8packed` and
 `dot4add_u8packed`. (#7401)

Change the definition of the HLSL `dot4add_i8packed` and
`dot4add_u8packed` intrinsics in `utils/hct/gen_intrin_main.txt` to
simply spell out the return types, rather than saying that their return
type is determined by their third argument.

This prevents DXC from trying to give those functions declarations like

declare i64 @"\01?dot4add_u8packed@hlsl@@YA_JII_J@Z"(i32, i32, i64
signext) #1

which seems to expect a 64-bit third argument and return value.

These functions are not generic, and they have only one overload, so
there is no need to use interesting `uComponentTypeId` values to get the
right effects, and `HLSLExternalSource::MatchArguments` seems to get
confused about how to treat argument types that affect the return types.

Fixes #7400.
---
 tools/clang/lib/Sema/SemaHLSL.cpp             |  4 +--
 .../test/DXC/dot4add_i8_u8_packed-types.hlsl  | 34 +++++++++++++++++++
 utils/hct/gen_intrin_main.txt                 |  6 ++--
 3 files changed, 39 insertions(+), 5 deletions(-)
 create mode 100644 tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl

diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 6e58c0e872..b15068638d 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -6737,8 +6737,8 @@ bool HLSLExternalSource::MatchArguments(
           (iArg != retArgIdx && retTypeIdx == pIntrinsicArg->uComponentTypeId);
       // For literal arg which don't affect return type, find concrete type.
       // For literal arg affect return type,
-      //   TryEvalIntrinsic in CGHLSLMS.cpp will take care of cases
-      //     where all argumentss are literal.
+      //   TryEvalIntrinsic in CGHLSLMSFinishCodeGen.cpp will take care of
+      //     cases where all arguments are literal.
       //   CombineBasicTypes will cover the rest cases.
       if (!affectRetType) {
         TypeInfoEltKind =
diff --git a/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl
new file mode 100644
index 0000000000..53c87bb9c1
--- /dev/null
+++ b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl
@@ -0,0 +1,34 @@
+// RUN: %dxc /enable-16bit-types /T cs_6_8 %s | FileCheck %s
+
+// Compiling this HLSL would fail this assertion in TranslateDot4AddPacked:
+//
+//     DXASSERT(
+//         !accTy->isVectorTy() && accTy->isIntegerTy(32),
+//         "otherwise, unexpected vector support in high level intrinsic template");
+//
+// Bug was fixed by changing the declarations of dot4add_i8packed and
+// dot4add_u8packed in utils/hct/gen_intrin_main.txt to simply write
+// out their argument and return types, rather than using the $typeN
+// reference syntax.
+
+// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddI8Packed(acc,a,b)
+// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddU8Packed(acc,a,b)
+// CHECK: call float @dx.op.dot2AddHalf.f32{{.*}}Dot2AddHalf(acc,ax,ay,bx,by)
+
+RWByteAddressBuffer buf;
+
+[numthreads(1, 1, 1)]
+void main()
+{
+    int a = dot4add_i8packed(0, 0, 0);
+    int b = dot4add_i8packed(0, 0, a);
+    buf.Store<int>(0, b);
+
+    uint c = dot4add_u8packed(0, 0, 0);
+    uint d = dot4add_u8packed(0, 0, c);
+    buf.Store<uint>(4, d);
+
+    float e = dot2add(half2(0,0), half2(0,0), 1.0);
+    float f = dot2add(half2(0,0), half2(0,0), e);
+    buf.Store<float>(8, f);
+}
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index c394611302..e5e4119330 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -339,9 +339,9 @@ float<4,3> [[rn]] ObjectToWorld4x3();
 float<4,3> [[rn]] WorldToObject4x3();
 
 // Packed dot products with accumulate:
-$type3 [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c);
-$type3 [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c);
-$type3 [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c);
+uint [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c);
+int [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c);
+float [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c);
 
 // Unpacking intrinsics
 int16_t<4> [[rn]] unpack_s8s16(in p32i8 pk);

From fb4d7d17bd5104cbbf259e8484390175fdc23192 Mon Sep 17 00:00:00 2001
From: Anupama Chandrasekhar <anupama.psu@gmail.com>
Date: Tue, 13 May 2025 09:06:24 -0700
Subject: [PATCH 33/93] [0029] [Main] For OuterProductAccumulate, matrix layout
 must be outerproductoptimal and matrix stride must be zero  (#7417)

Implements the DXIL portion of https://github.com/microsoft/hlsl-specs/pull/494. The HLSL checks will be part of the HLSL validation checks; they were not added to this PR due to shared infrastructure.
---
 docs/DXIL.rst                                 | 584 +++++++++---------
 include/dxc/DXIL/DxilConstants.h              |   1 +
 lib/DxilValidation/DxilValidation.cpp         |  26 +
 .../linalg_builtins/check-shader-stages.hlsl  |   2 +-
 .../linalg_builtins/linalg-builtins.hlsl      |   4 +-
 ...uter-product-accumulate-multioverload.hlsl |  18 +-
 ...uter-product-accumulate-matrix-layout.hlsl |  28 +
 .../DXC/Passes/DxilGen/linalg-builtins.ll     |   4 +-
 ...roduct-accumulate-matrix-layout-failing.ll |  86 +++
 ...roduct-accumulate-matrix-layout-passing.ll |  65 ++
 utils/hct/hctdb.py                            |  12 +
 11 files changed, 525 insertions(+), 305 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl
 create mode 100644 tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll
 create mode 100644 tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll

diff --git a/docs/DXIL.rst b/docs/DXIL.rst
index 7532ec3c42..1a2a691d27 100644
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@@ -3069,297 +3069,299 @@ The set of validation rules that are known to hold for a DXIL program is identif
 .. <py::lines('VALRULES-RST')>hctdb_instrhelp.get_valrules_rst()</py>
 .. VALRULES-RST:BEGIN
 
-===================================================== ========================================================================================================================================================================================================================================================================================================
-Rule Code                                             Description
-===================================================== ========================================================================================================================================================================================================================================================================================================
-BITCODE.VALID                                         Module must be bitcode-valid
-CONTAINER.CONTENTINVALID                              DXIL Container Content is well-formed
-CONTAINER.CONTENTMATCHES                              DXIL Container Content must match Module
-CONTAINER.PARTINVALID                                 DXIL Container must not contain unknown parts
-CONTAINER.PARTMATCHES                                 DXIL Container Parts must match Module
-CONTAINER.PARTMISSING                                 DXIL Container requires certain parts, corresponding to module
-CONTAINER.PARTREPEATED                                DXIL Container must have only one of each part type
-CONTAINER.ROOTSIGNATUREINCOMPATIBLE                   Root Signature in DXIL Container must be compatible with shader
-CONTAINER.UNUSEDITEMINTABLE                           Items in Table must be used
-DECL.ALLOCATERAYQUERY2FLAGSARECONST                   constRayFlags and RayQueryFlags for AllocateRayQuery2 must be constant
-DECL.ALLOCATERAYQUERYFLAGSARECONST                    RayFlags for AllocateRayQuery must be constant
-DECL.ALLOWOPACITYMICROMAPSEXPECTEDGIVENFORCEOMM2STATE When the ForceOMM2State ConstRayFlag is given as an argument to a RayQuery object, AllowOpacityMicromaps is expected as a RayQueryFlag argument
-DECL.ATTRSTRUCT                                       Attributes parameter must be struct type
-DECL.DXILFNEXTERN                                     External function must be a DXIL function
-DECL.DXILNSRESERVED                                   The DXIL reserved prefixes must only be used by built-in functions and types
-DECL.EXTRAARGS                                        Extra arguments not allowed for shader functions
-DECL.FNATTRIBUTE                                      Functions should only contain known function attributes
-DECL.FNFLATTENPARAM                                   Function parameters must not use struct types
-DECL.FNISCALLED                                       Functions can only be used by call instructions
-DECL.MULTIPLENODEINPUTS                               A node shader may not have more than one input record
-DECL.NODELAUNCHINPUTTYPE                              Invalid input record type for node launch type
-DECL.NOTUSEDEXTERNAL                                  External declaration should not be used
-DECL.PARAMSTRUCT                                      Callable function parameter must be struct type
-DECL.PAYLOADSTRUCT                                    Payload parameter must be struct type
-DECL.RAYQUERYINFNSIG                                  Rayquery objects not allowed in function signatures
-DECL.RESOURCEINFNSIG                                  Resources not allowed in function signatures
-DECL.SHADERMISSINGARG                                 payload/params/attributes parameter is required for certain shader types
-DECL.SHADERRETURNVOID                                 Shader functions must return void
-DECL.USEDEXTERNALFUNCTION                             External function must be used
-DECL.USEDINTERNAL                                     Internal declaration must be used
-FLOW.DEADLOOP                                         Loop must have break.
-FLOW.FUNCTIONCALL                                     Function with parameter is not permitted
-FLOW.NORECURSION                                      Recursion is not permitted.
-FLOW.REDUCIBLE                                        Execution flow must be reducible.
-INSTR.ALLOWED                                         Instructions must be of an allowed type.
-INSTR.ATOMICCONST                                     Constant destination to atomic.
-INSTR.ATOMICINTRINNONUAV                              Non-UAV destination to atomic intrinsic.
-INSTR.ATOMICOPNONGROUPSHAREDORRECORD                  Non-groupshared or node record destination to atomic operation.
-INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION                Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
-INSTR.BARRIERFLAGINVALID                              Invalid %0 flags on DXIL operation '%1'
-INSTR.BARRIERMODEFORNONCS                             sync in a non-Compute/Amplification/Mesh/Node Shader must only sync UAV (sync_uglobal).
-INSTR.BARRIERMODENOMEMORY                             sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
-INSTR.BARRIERMODEUSELESSUGROUP                        sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
-INSTR.BARRIERNONCONSTANTFLAGARGUMENT                  Memory type, access, or sync flag is not constant
-INSTR.BARRIERREQUIRESNODE                             sync in a non-Node Shader must not sync node record memory.
-INSTR.BUFFERUPDATECOUNTERONRESHASCOUNTER              BufferUpdateCounter valid only when HasCounter is true.
-INSTR.BUFFERUPDATECOUNTERONUAV                        BufferUpdateCounter valid only on UAV.
-INSTR.CALLOLOAD                                       Call to DXIL intrinsic must match overload signature
-INSTR.CANNOTPULLPOSITION                              pull-model evaluation of position disallowed
-INSTR.CBUFFERCLASSFORCBUFFERHANDLE                    Expect Cbuffer for CBufferLoad handle.
-INSTR.CBUFFEROUTOFBOUND                               Cbuffer access out of bound.
-INSTR.CHECKACCESSFULLYMAPPED                          CheckAccessFullyMapped should only be used on resource status.
-INSTR.CONSTALIGNFORRAWBUF                             Raw Buffer alignment value must be a constant.
-INSTR.COORDINATECOUNTFORRAWTYPEDBUF                   raw/typed buffer offset must be undef.
-INSTR.COORDINATECOUNTFORSTRUCTBUF                     structured buffer requires defined index and offset coordinates.
-INSTR.CREATEHANDLEIMMRANGEID                          Local resource must map to global resource.
-INSTR.DXILSTRUCTUSER                                  Dxil struct types should only be used by ExtractValue.
-INSTR.DXILSTRUCTUSEROUTOFBOUND                        Index out of bound when extract value from dxil struct types.
-INSTR.EVALINTERPOLATIONMODE                           Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample.
-INSTR.EXTRACTVALUE                                    ExtractValue should only be used on dxil struct types and cmpxchg.
-INSTR.FAILTORESLOVETGSMPOINTER                        TGSM pointers must originate from an unambiguous TGSM global variable.
-INSTR.HANDLENOTFROMCREATEHANDLE                       Resource handle should returned by createHandle.
-INSTR.ILLEGALDXILOPCODE                               DXILOpCode must be [0..%0].  %1 specified.
-INSTR.ILLEGALDXILOPFUNCTION                           '%0' is not a DXILOpFuncition for DXILOpcode '%1'.
-INSTR.IMMBIASFORSAMPLEB                               bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate.
-INSTR.INBOUNDSACCESS                                  Access to out-of-bounds memory is disallowed.
-INSTR.LINALGINTERPRETATIONPARAMARECONST               In Linalg operations, Interpretation value is a constant.
-INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFORMATVECOPS      Matrix Layout for Linalg Mul/MulAdd operation must be valid.
-INSTR.LINALGINVALIDMEMORYINTERPVALUE                  In Memory Interpolation value must be valid.
-INSTR.LINALGINVALIDREGISTERINTERPVALUE                From Register Interpretation value must be valid.
-INSTR.LINALGMATRIXLAYOUTNOTTRANSPOSABLE               Row Major and Column Major matrix layouts are not transposable.
-INSTR.LINALGMATRIXSHAPEPARAMSARECONST                 Matrix Layout, Dimensions and isTranspose are constants
-INSTR.LINALGNOTANUNSIGNEDTYPE                         Unsigned flag set for a float signed type
-INSTR.MATVECOPISUNSIGNEDFLAGSARECONST                 In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant.
-INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM         Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.
-INSTR.MINPRECISIONNOTPRECISE                          Instructions marked precise may not refer to minprecision values.
-INSTR.MINPRECISONBITCAST                              Bitcast on minprecison types is not allowed.
-INSTR.MIPLEVELFORGETDIMENSION                         Use mip level on buffer when GetDimensions.
-INSTR.MIPONUAVLOAD                                    uav load don't support mipLevel/sampleIndex.
-INSTR.MISSINGSETMESHOUTPUTCOUNTS                      Missing SetMeshOutputCounts call.
-INSTR.MULTIPLEGETMESHPAYLOAD                          GetMeshPayload cannot be called multiple times.
-INSTR.MULTIPLESETMESHOUTPUTCOUNTS                     SetMeshOUtputCounts cannot be called multiple times.
-INSTR.NODERECORDHANDLEUSEAFTERCOMPLETE                Invalid use of completed record handle.
-INSTR.NOGENERICPTRADDRSPACECAST                       Address space cast between pointer types must have one part to be generic address space.
-INSTR.NOIDIVBYZERO                                    No signed integer division by zero.
-INSTR.NOINDEFINITEACOS                                No indefinite arccosine.
-INSTR.NOINDEFINITEASIN                                No indefinite arcsine.
-INSTR.NOINDEFINITEDSXY                                No indefinite derivative calculation.
-INSTR.NOINDEFINITELOG                                 No indefinite logarithm.
-INSTR.NONDOMINATINGDISPATCHMESH                       Non-Dominating DispatchMesh call.
-INSTR.NONDOMINATINGSETMESHOUTPUTCOUNTS                Non-Dominating SetMeshOutputCounts call.
-INSTR.NOREADINGUNINITIALIZED                          Instructions should not read uninitialized value.
-INSTR.NOTONCEDISPATCHMESH                             DispatchMesh must be called exactly once in an Amplification shader.
-INSTR.NOUDIVBYZERO                                    No unsigned integer division by zero.
-INSTR.OFFSETONUAVLOAD                                 uav load don't support offset.
-INSTR.OLOAD                                           DXIL intrinsic overload must be valid.
-INSTR.ONLYONEALLOCCONSUME                             RWStructuredBuffers may increment or decrement their counters, but not both.
-INSTR.OPCODERESERVED                                  Instructions must not reference reserved opcodes.
-INSTR.OPCONST                                         DXIL intrinsic requires an immediate constant operand
-INSTR.OPCONSTRANGE                                    Constant values must be in-range for operation.
-INSTR.OPERANDRANGE                                    DXIL intrinsic operand must be within defined range
-INSTR.PARAMMULTIPLE                                   Parameter must be a valid multiple
-INSTR.PTRBITCAST                                      Pointer type bitcast must be have same size.
-INSTR.REORDERCOHERENTREQUIRESSM69                     reordercoherent requires SM 6.9 or later.
-INSTR.RESOURCECLASSFORLOAD                            load can only run on UAV/SRV resource.
-INSTR.RESOURCECLASSFORSAMPLERGATHER                   sample, lod and gather should be on srv resource.
-INSTR.RESOURCECLASSFORUAVSTORE                        store should be on uav resource.
-INSTR.RESOURCECOORDINATEMISS                          coord uninitialized.
-INSTR.RESOURCECOORDINATETOOMANY                       out of bound coord must be undef.
-INSTR.RESOURCEKINDFORBUFFERLOADSTORE                  buffer load/store only works on Raw/Typed/StructuredBuffer.
-INSTR.RESOURCEKINDFORCALCLOD                          lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray.
-INSTR.RESOURCEKINDFORGATHER                           gather requires resource declared as texture/2D/Cube/2DArray/CubeArray.
-INSTR.RESOURCEKINDFORGETDIM                           Invalid resource kind on GetDimensions.
-INSTR.RESOURCEKINDFORSAMPLE                           sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray.
-INSTR.RESOURCEKINDFORSAMPLEC                          samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray.
-INSTR.RESOURCEKINDFORTEXTURELOAD                      texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray.
-INSTR.RESOURCEKINDFORTEXTURESTORE                     texture store only works on Texture1D/1DArray/2D/2DArray/3D.
-INSTR.RESOURCEKINDFORTRACERAY                         TraceRay should only use RTAccelerationStructure.
-INSTR.RESOURCEMAPTOSINGLEENTRY                        Fail to map resource to resource table.
-INSTR.RESOURCEOFFSETMISS                              offset uninitialized.
-INSTR.RESOURCEOFFSETTOOMANY                           out of bound offset must be undef.
-INSTR.RESOURCEUSER                                    Resource should only be used by Load/GEP/Call.
-INSTR.SAMPLECOMPTYPE                                  sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
-INSTR.SAMPLEINDEXFORLOAD2DMS                          load on Texture2DMS/2DMSArray require sampleIndex.
-INSTR.SAMPLERMODEFORLOD                               lod instruction requires sampler declared in default mode.
-INSTR.SAMPLERMODEFORSAMPLE                            sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode.
-INSTR.SAMPLERMODEFORSAMPLEC                           sample_c_*/gather_c instructions require sampler declared in comparison mode.
-INSTR.SIGNATUREOPERATIONNOTINENTRY                    Dxil operation for input output signature must be in entryPoints.
-INSTR.STATUS                                          Resource status should only be used by CheckAccessFullyMapped.
-INSTR.STRUCTBITCAST                                   Bitcast on struct types is not allowed.
-INSTR.SVCONFLICTINGLAUNCHMODE                         Input system values are compatible with node shader launch mode.
-INSTR.TEXTUREOFFSET                                   offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7.
-INSTR.TGSMRACECOND                                    Race condition writing to shared memory detected, consider making this write conditional.
-INSTR.UNDEFHITOBJECT                                  HitObject is undef.
-INSTR.UNDEFINEDVALUEFORUAVSTORE                       Assignment of undefined values to UAV.
-INSTR.UNDEFRESULTFORGETDIMENSION                      GetDimensions used undef dimension %0 on %1.
-INSTR.WRITEMASKFORTYPEDUAVSTORE                       store on typed uav must write to all four components of the UAV.
-INSTR.WRITEMASKGAPFORUAV                              UAV write mask must be contiguous, starting at x: .x, .xy, .xyz, or .xyzw.
-INSTR.WRITEMASKMATCHVALUEFORUAVSTORE                  uav store write mask must match store value mask, write mask is %0 and store value mask is %1.
-META.BARYCENTRICSFLOAT3                               only 'float3' type is allowed for SV_Barycentrics.
-META.BARYCENTRICSINTERPOLATION                        SV_Barycentrics cannot be used with 'nointerpolation' type.
-META.BARYCENTRICSTWOPERSPECTIVES                      There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
-META.BRANCHFLATTEN                                    Can't use branch and flatten attributes together.
-META.CLIPCULLMAXCOMPONENTS                            Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
-META.CLIPCULLMAXROWS                                  Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
-META.COHERENCENOTONAPPENDCONSUME                      globally/reorder coherent incompatible with append/consume/counter buffers
-META.COMPUTEWITHNODE                                  Compute entry must not have node metadata
-META.CONTROLFLOWHINTNOTONCONTROLFLOW                  Control flow hint only works on control flow inst.
-META.DENSERESIDS                                      Resource identifiers must be zero-based and dense.
-META.DUPLICATESYSVALUE                                System value may only appear once in signature
-META.ENTRYFUNCTION                                    entrypoint not found.
-META.FLAGSUSAGE                                       Flags must match usage.
-META.FORCECASEONSWITCH                                Attribute forcecase only works for switch.
-META.INTEGERINTERPMODE                                Interpolation mode on integer must be Constant
-META.INTERPMODEINONEROW                               Interpolation mode must be identical for all elements packed into the same row.
-META.INTERPMODEVALID                                  Interpolation mode must be valid
-META.INVALIDCONTROLFLOWHINT                           Invalid control flow hint.
-META.KNOWN                                            Named metadata should be known
-META.MAXTESSFACTOR                                    Hull Shader MaxTessFactor must be [%0..%1].  %2 specified.
-META.NOENTRYPROPSFORENTRY                             Entry point %0 must have entry properties.
-META.NOSEMANTICOVERLAP                                Semantics must not overlap
-META.REQUIRED                                         Required metadata missing.
-META.SEMAKINDMATCHESNAME                              Semantic name must match system value, when defined.
-META.SEMAKINDVALID                                    Semantic kind must be valid
-META.SEMANTICCOMPTYPE                                 %0 must be %1.
-META.SEMANTICINDEXMAX                                 System value semantics have a maximum valid semantic index
-META.SEMANTICLEN                                      Semantic length must be at least 1 and at most 64.
-META.SEMANTICSHOULDBEALLOCATED                        Semantic should have a valid packing location
-META.SEMANTICSHOULDNOTBEALLOCATED                     Semantic should have a packing location of -1
-META.SIGNATURECOMPTYPE                                signature %0 specifies unrecognized or invalid component type.
-META.SIGNATUREDATAWIDTH                               Data width must be identical for all elements packed into the same row.
-META.SIGNATUREILLEGALCOMPONENTORDER                   Component ordering for packed elements must be: arbitrary < system value < system generated value
-META.SIGNATUREINDEXCONFLICT                           Only elements with compatible indexing rules may be packed together
-META.SIGNATUREOUTOFRANGE                              Signature elements must fit within maximum signature size
-META.SIGNATUREOVERLAP                                 Signature elements may not overlap in packing location.
-META.STRUCTBUFALIGNMENT                               StructuredBuffer stride not aligned
-META.STRUCTBUFALIGNMENTOUTOFBOUND                     StructuredBuffer stride out of bounds
-META.SYSTEMVALUEROWS                                  System value may only have 1 row
-META.TARGET                                           Target triple must be 'dxil-ms-dx'
-META.TESSELLATOROUTPUTPRIMITIVE                       Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
-META.TESSELLATORPARTITION                             Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
-META.TEXTURETYPE                                      elements of typed buffers and textures must fit in four 32-bit quantities.
-META.USED                                             All metadata must be used by dxil.
-META.VALIDSAMPLERMODE                                 Invalid sampler mode on sampler .
-META.VALUERANGE                                       Metadata value must be within range.
-META.VERSIONSUPPORTED                                 Version in metadata must be supported.
-META.WELLFORMED                                       Metadata must be well-formed in operand count and types.
-SM.64BITRAWBUFFERLOADSTORE                            i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3.
-SM.AMPLIFICATIONSHADERPAYLOADSIZE                     For amplification shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes.
-SM.AMPLIFICATIONSHADERPAYLOADSIZEDECLARED             For amplification shader with entry '%0', payload size %1 is greater than declared size of %2 bytes.
-SM.APPENDANDCONSUMEONSAMEUAV                          BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
-SM.CBUFFERARRAYOFFSETALIGNMENT                        CBuffer array offset must be aligned to 16-bytes
-SM.CBUFFERELEMENTOVERFLOW                             CBuffer elements must not overflow
-SM.CBUFFEROFFSETOVERLAP                               CBuffer offsets must not overlap
-SM.CBUFFERSIZE                                        CBuffer size must not exceed 65536 bytes
-SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT                    D3D12 constant/texture buffer template element can only be a struct.
-SM.COMPLETEPOSITION                                   Not all elements of SV_Position were written.
-SM.CONSTANTINTERPMODE                                 Interpolation mode must be constant for MS primitive output.
-SM.COUNTERONLYONSTRUCTBUF                             BufferUpdateCounter valid only on structured buffers.
-SM.CSNOSIGNATURES                                     Compute shaders must not have shader signatures.
-SM.DOMAINLOCATIONIDXOOB                               DomainLocation component index out of bounds for the domain.
-SM.DSINPUTCONTROLPOINTCOUNTRANGE                      DS input control point count must be [0..%0].  %1 specified.
-SM.DXILVERSION                                        Target shader model requires specific Dxil Version
-SM.GSINSTANCECOUNTRANGE                               GS instance count must be [1..%0].  %1 specified.
-SM.GSOUTPUTVERTEXCOUNTRANGE                           GS output vertex count must be [0..%0].  %1 specified.
-SM.GSTOTALOUTPUTVERTEXDATARANGE                       Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3.
-SM.GSVALIDINPUTPRIMITIVE                              GS input primitive unrecognized.
-SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY                     GS output primitive topology unrecognized.
-SM.HSINPUTCONTROLPOINTCOUNTRANGE                      HS input control point count must be [0..%0].  %1 specified.
-SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH                 For pass thru hull shader, input control point count must match output control point count
-SM.INCOMPATIBLECALLINENTRY                            Features used in internal function calls must be compatible with entry
-SM.INCOMPATIBLEDERIVINCOMPUTESHADERMODEL              Derivatives in compute-model shaders require shader model 6.6 and above
-SM.INCOMPATIBLEDERIVLAUNCH                            Node shaders only support derivatives in broadcasting launch mode
-SM.INCOMPATIBLEOPERATION                              Operations used in entry function must be compatible with shader stage and other properties
-SM.INCOMPATIBLEREQUIRESGROUP                          Functions requiring groupshared memory must be called from shaders with a visible group
-SM.INCOMPATIBLESHADERMODEL                            Functions may only use features available in the current shader model
-SM.INCOMPATIBLESTAGE                                  Functions may only use features available in the entry function's stage
-SM.INCOMPATIBLETHREADGROUPDIM                         When derivatives are used in compute-model shaders, the thread group dimensions must be compatible
-SM.INSIDETESSFACTORSIZEMATCHDOMAIN                    InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.INVALIDRESOURCECOMPTYPE                            Invalid resource return type.
-SM.INVALIDRESOURCEKIND                                Invalid resources kind.
-SM.INVALIDSAMPLERFEEDBACKTYPE                         Invalid sampler feedback type.
-SM.INVALIDTEXTUREKINDONUAV                            TextureCube[Array] resources are not supported with UAVs.
-SM.ISOLINEOUTPUTPRIMITIVEMISMATCH                     Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
-SM.MAXMSSMSIZE                                        Total Thread Group Shared Memory storage is %0, exceeded %1.
-SM.MAXTGSMSIZE                                        Total Thread Group Shared Memory storage is %0, exceeded %1.
-SM.MAXTHEADGROUP                                      Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1.
-SM.MESHPSIGROWCOUNT                                   For shader '%0', primitive output signatures are taking up more than %1 rows.
-SM.MESHSHADERINOUTSIZE                                For shader '%0', payload plus output size is greater than %1.
-SM.MESHSHADERMAXPRIMITIVECOUNT                        MS max primitive output count must be [0..%0].  %1 specified.
-SM.MESHSHADERMAXVERTEXCOUNT                           MS max vertex output count must be [0..%0].  %1 specified.
-SM.MESHSHADEROUTPUTSIZE                               For shader '%0', vertex plus primitive output size is greater than %1.
-SM.MESHSHADERPAYLOADSIZE                              For mesh shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes.
-SM.MESHSHADERPAYLOADSIZEDECLARED                      For mesh shader with entry '%0', payload size %1 is greater than declared size of %2 bytes.
-SM.MESHTOTALSIGROWCOUNT                               For shader '%0', vertex and primitive output signatures are taking up more than %1 rows.
-SM.MESHVSIGROWCOUNT                                   For shader '%0', vertex output signatures are taking up more than %1 rows.
-SM.MULTISTREAMMUSTBEPOINT                             When multiple GS output streams are used they must be pointlists
-SM.NAME                                               Target shader model name must be known
-SM.NOINTERPMODE                                       Interpolation mode must be undefined for VS input/PS output/patch constant.
-SM.NOPSOUTPUTIDX                                      Pixel shader output registers are not indexable.
-SM.OPCODE                                             Opcode must be defined in target shader model
-SM.OPCODEININVALIDFUNCTION                            Invalid DXIL opcode usage like StorePatchConstant in patch constant function
-SM.OPERAND                                            Operand must be defined in target shader model.
-SM.OUTPUTCONTROLPOINTCOUNTRANGE                       output control point count must be [%0..%1].  %2 specified.
-SM.OUTPUTCONTROLPOINTSTOTALSCALARS                    Total number of scalars across all HS output control points must not exceed .
-SM.PATCHCONSTANTONLYFORHSDS                           patch constant signature only valid in HS and DS.
-SM.PROGRAMVERSION                                     Program Version in Dxil Container does not match Dxil Module shader model version
-SM.PSCONSISTENTINTERP                                 Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample).
-SM.PSCOVERAGEANDINNERCOVERAGE                         InnerCoverage and Coverage are mutually exclusive.
-SM.PSMULTIPLEDEPTHSEMANTIC                            Pixel Shader only allows one type of depth semantic to be declared.
-SM.PSOUTPUTSEMANTIC                                   Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found.
-SM.PSTARGETCOL0                                       SV_Target packed location must start at column 0.
-SM.PSTARGETINDEXMATCHESROW                            SV_Target semantic index must match packed row location.
-SM.RAYSHADERPAYLOADSIZE                               For shader '%0', %1 size is smaller than argument's allocation size.
-SM.RAYSHADERSIGNATURES                                Ray tracing shader '%0' should not have any shader signatures.
-SM.RESOURCERANGEOVERLAP                               Resource ranges must not overlap
-SM.ROVONLYINPS                                        RasterizerOrdered objects are only allowed in 5.0+ pixel shaders.
-SM.SAMPLECOUNTONLYON2DMS                              Only Texture2DMS/2DMSArray could has sample count.
-SM.SEMANTIC                                           Semantic must be defined in target shader model
-SM.STREAMINDEXRANGE                                   Stream index (%0) must between 0 and %1.
-SM.TESSFACTORFORDOMAIN                                Required TessFactor for domain not found declared anywhere in Patch Constant data.
-SM.TESSFACTORSIZEMATCHDOMAIN                          TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.TGSMUNSUPPORTED                                    Thread Group Shared Memory not supported %0.
-SM.THREADGROUPCHANNELRANGE                            Declared Thread Group %0 size %1 outside valid range [%2..%3].
-SM.TRIOUTPUTPRIMITIVEMISMATCH                         Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain.
-SM.UNDEFINEDOUTPUT                                    Not all elements of output %0 were written.
-SM.VALIDDOMAIN                                        Invalid Tessellator Domain specified. Must be isoline, tri or quad.
-SM.VIEWIDNEEDSSLOT                                    ViewID requires compatible space in pixel shader input signature
-SM.WAVESIZEALLZEROWHENUNDEFINED                       WaveSize Max and Preferred must be 0 when Min is 0
-SM.WAVESIZEEXPECTSONEPARAM                            WaveSize tag expects exactly 1 parameter.
-SM.WAVESIZEMAXANDPREFERREDZEROWHENNORANGE             WaveSize Max and Preferred must be 0 to encode min==max
-SM.WAVESIZEMAXGREATERTHANMIN                          WaveSize Max must greater than Min
-SM.WAVESIZENEEDSCONSTANTOPERANDS                      WaveSize metadata operands must be constant values.
-SM.WAVESIZENEEDSSM66OR67                              WaveSize is valid only for Shader Model 6.6 and 6.7.
-SM.WAVESIZEONCOMPUTEORNODE                            WaveSize only allowed on compute or node shaders
-SM.WAVESIZEPREFERREDINRANGE                           WaveSize Preferred must be within Min..Max range
-SM.WAVESIZERANGEEXPECTSTHREEPARAMS                    WaveSize Range tag expects exactly 3 parameters.
-SM.WAVESIZERANGENEEDSSM68PLUS                         WaveSize Range is valid only for Shader Model 6.8 and higher.
-SM.WAVESIZETAGDUPLICATE                               WaveSize or WaveSizeRange tag may only appear once per entry point.
-SM.WAVESIZEVALUE                                      WaveSize value must be a power of 2 in range [4..128]
-SM.ZEROHSINPUTCONTROLPOINTWITHINPUT                   When HS input control point count is 0, no input signature should exist.
-TYPES.DEFINED                                         Type must be defined based on DXIL primitives
-TYPES.I8                                              I8 can only be used as immediate value for intrinsic or as i8* via bitcast by lifetime intrinsics.
-TYPES.INTWIDTH                                        Int type must be of valid width
-TYPES.NOMULTIDIM                                      Only one dimension allowed for array type.
-TYPES.NOPTRTOPTR                                      Pointers to pointers, or pointers in structures are not allowed.
-TYPES.NOVECTOR                                        Vector types must not be present
-===================================================== ========================================================================================================================================================================================================================================================================================================
+============================================================= ========================================================================================================================================================================================================================================================================================================
+Rule Code                                                     Description
+============================================================= ========================================================================================================================================================================================================================================================================================================
+BITCODE.VALID                                                 Module must be bitcode-valid
+CONTAINER.CONTENTINVALID                                      DXIL Container Content is well-formed
+CONTAINER.CONTENTMATCHES                                      DXIL Container Content must match Module
+CONTAINER.PARTINVALID                                         DXIL Container must not contain unknown parts
+CONTAINER.PARTMATCHES                                         DXIL Container Parts must match Module
+CONTAINER.PARTMISSING                                         DXIL Container requires certain parts, corresponding to module
+CONTAINER.PARTREPEATED                                        DXIL Container must have only one of each part type
+CONTAINER.ROOTSIGNATUREINCOMPATIBLE                           Root Signature in DXIL Container must be compatible with shader
+CONTAINER.UNUSEDITEMINTABLE                                   Items in Table must be used
+DECL.ALLOCATERAYQUERY2FLAGSARECONST                           constRayFlags and RayQueryFlags for AllocateRayQuery2 must be constant
+DECL.ALLOCATERAYQUERYFLAGSARECONST                            RayFlags for AllocateRayQuery must be constant
+DECL.ALLOWOPACITYMICROMAPSEXPECTEDGIVENFORCEOMM2STATE         When the ForceOMM2State ConstRayFlag is given as an argument to a RayQuery object, AllowOpacityMicromaps is expected as a RayQueryFlag argument
+DECL.ATTRSTRUCT                                               Attributes parameter must be struct type
+DECL.DXILFNEXTERN                                             External function must be a DXIL function
+DECL.DXILNSRESERVED                                           The DXIL reserved prefixes must only be used by built-in functions and types
+DECL.EXTRAARGS                                                Extra arguments not allowed for shader functions
+DECL.FNATTRIBUTE                                              Functions should only contain known function attributes
+DECL.FNFLATTENPARAM                                           Function parameters must not use struct types
+DECL.FNISCALLED                                               Functions can only be used by call instructions
+DECL.MULTIPLENODEINPUTS                                       A node shader may not have more than one input record
+DECL.NODELAUNCHINPUTTYPE                                      Invalid input record type for node launch type
+DECL.NOTUSEDEXTERNAL                                          External declaration should not be used
+DECL.PARAMSTRUCT                                              Callable function parameter must be struct type
+DECL.PAYLOADSTRUCT                                            Payload parameter must be struct type
+DECL.RAYQUERYINFNSIG                                          Rayquery objects not allowed in function signatures
+DECL.RESOURCEINFNSIG                                          Resources not allowed in function signatures
+DECL.SHADERMISSINGARG                                         payload/params/attributes parameter is required for certain shader types
+DECL.SHADERRETURNVOID                                         Shader functions must return void
+DECL.USEDEXTERNALFUNCTION                                     External function must be used
+DECL.USEDINTERNAL                                             Internal declaration must be used
+FLOW.DEADLOOP                                                 Loop must have break.
+FLOW.FUNCTIONCALL                                             Function with parameter is not permitted
+FLOW.NORECURSION                                              Recursion is not permitted.
+FLOW.REDUCIBLE                                                Execution flow must be reducible.
+INSTR.ALLOWED                                                 Instructions must be of an allowed type.
+INSTR.ATOMICCONST                                             Constant destination to atomic.
+INSTR.ATOMICINTRINNONUAV                                      Non-UAV destination to atomic intrinsic.
+INSTR.ATOMICOPNONGROUPSHAREDORRECORD                          Non-groupshared or node record destination to atomic operation.
+INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION                        Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
+INSTR.BARRIERFLAGINVALID                                      Invalid %0 flags on DXIL operation '%1'
+INSTR.BARRIERMODEFORNONCS                                     sync in a non-Compute/Amplification/Mesh/Node Shader must only sync UAV (sync_uglobal).
+INSTR.BARRIERMODENOMEMORY                                     sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
+INSTR.BARRIERMODEUSELESSUGROUP                                sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
+INSTR.BARRIERNONCONSTANTFLAGARGUMENT                          Memory type, access, or sync flag is not constant
+INSTR.BARRIERREQUIRESNODE                                     sync in a non-Node Shader must not sync node record memory.
+INSTR.BUFFERUPDATECOUNTERONRESHASCOUNTER                      BufferUpdateCounter valid only when HasCounter is true.
+INSTR.BUFFERUPDATECOUNTERONUAV                                BufferUpdateCounter valid only on UAV.
+INSTR.CALLOLOAD                                               Call to DXIL intrinsic must match overload signature
+INSTR.CANNOTPULLPOSITION                                      pull-model evaluation of position disallowed
+INSTR.CBUFFERCLASSFORCBUFFERHANDLE                            Expect Cbuffer for CBufferLoad handle.
+INSTR.CBUFFEROUTOFBOUND                                       Cbuffer access out of bound.
+INSTR.CHECKACCESSFULLYMAPPED                                  CheckAccessFullyMapped should only be used on resource status.
+INSTR.CONSTALIGNFORRAWBUF                                     Raw Buffer alignment value must be a constant.
+INSTR.COORDINATECOUNTFORRAWTYPEDBUF                           raw/typed buffer offset must be undef.
+INSTR.COORDINATECOUNTFORSTRUCTBUF                             structured buffer requires defined index and offset coordinates.
+INSTR.CREATEHANDLEIMMRANGEID                                  Local resource must map to global resource.
+INSTR.DXILSTRUCTUSER                                          Dxil struct types should only be used by ExtractValue.
+INSTR.DXILSTRUCTUSEROUTOFBOUND                                Index out of bound when extract value from dxil struct types.
+INSTR.EVALINTERPOLATIONMODE                                   Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample.
+INSTR.EXTRACTVALUE                                            ExtractValue should only be used on dxil struct types and cmpxchg.
+INSTR.FAILTORESLOVETGSMPOINTER                                TGSM pointers must originate from an unambiguous TGSM global variable.
+INSTR.HANDLENOTFROMCREATEHANDLE                               Resource handle should be returned by createHandle.
+INSTR.ILLEGALDXILOPCODE                                       DXILOpCode must be [0..%0].  %1 specified.
+INSTR.ILLEGALDXILOPFUNCTION                                   '%0' is not a DXILOpFunction for DXILOpcode '%1'.
+INSTR.IMMBIASFORSAMPLEB                                       bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate.
+INSTR.INBOUNDSACCESS                                          Access to out-of-bounds memory is disallowed.
+INSTR.LINALGINTERPRETATIONPARAMARECONST                       In Linalg operations, Interpretation value is a constant.
+INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFORMATVECOPS              Matrix Layout for Linalg Mul/MulAdd operation must be valid.
+INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFOROUTERPRODUCTACCUMULATE Matrix Layout for Linalg OuterProductAccumulate operation must be valid.
+INSTR.LINALGINVALIDMEMORYINTERPVALUE                          In Memory Interpretation value must be valid.
+INSTR.LINALGINVALIDREGISTERINTERPVALUE                        From Register Interpretation value must be valid.
+INSTR.LINALGMATRIXLAYOUTNOTTRANSPOSABLE                       Row Major and Column Major matrix layouts are not transposable.
+INSTR.LINALGMATRIXSHAPEPARAMSARECONST                         Matrix Layout, Dimensions and isTranspose are constants
+INSTR.LINALGMATRIXSTRIDEZEROFOROPTIMALLAYOUTS                 For optimal layouts, matrix stride must be zero.
+INSTR.LINALGNOTANUNSIGNEDTYPE                                 Unsigned flag set for a float signed type
+INSTR.MATVECOPISUNSIGNEDFLAGSARECONST                         In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant.
+INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM                 Use of undef coherence hint or num coherence hint bits in MaybeReorderThread.
+INSTR.MINPRECISIONNOTPRECISE                                  Instructions marked precise may not refer to minprecision values.
+INSTR.MINPRECISONBITCAST                                      Bitcast on minprecision types is not allowed.
+INSTR.MIPLEVELFORGETDIMENSION                                 Use mip level on buffer when GetDimensions.
+INSTR.MIPONUAVLOAD                                            uav load doesn't support mipLevel/sampleIndex.
+INSTR.MISSINGSETMESHOUTPUTCOUNTS                              Missing SetMeshOutputCounts call.
+INSTR.MULTIPLEGETMESHPAYLOAD                                  GetMeshPayload cannot be called multiple times.
+INSTR.MULTIPLESETMESHOUTPUTCOUNTS                             SetMeshOutputCounts cannot be called multiple times.
+INSTR.NODERECORDHANDLEUSEAFTERCOMPLETE                        Invalid use of completed record handle.
+INSTR.NOGENERICPTRADDRSPACECAST                               Address space cast between pointer types must have one part to be generic address space.
+INSTR.NOIDIVBYZERO                                            No signed integer division by zero.
+INSTR.NOINDEFINITEACOS                                        No indefinite arccosine.
+INSTR.NOINDEFINITEASIN                                        No indefinite arcsine.
+INSTR.NOINDEFINITEDSXY                                        No indefinite derivative calculation.
+INSTR.NOINDEFINITELOG                                         No indefinite logarithm.
+INSTR.NONDOMINATINGDISPATCHMESH                               Non-Dominating DispatchMesh call.
+INSTR.NONDOMINATINGSETMESHOUTPUTCOUNTS                        Non-Dominating SetMeshOutputCounts call.
+INSTR.NOREADINGUNINITIALIZED                                  Instructions should not read uninitialized value.
+INSTR.NOTONCEDISPATCHMESH                                     DispatchMesh must be called exactly once in an Amplification shader.
+INSTR.NOUDIVBYZERO                                            No unsigned integer division by zero.
+INSTR.OFFSETONUAVLOAD                                         uav load doesn't support offset.
+INSTR.OLOAD                                                   DXIL intrinsic overload must be valid.
+INSTR.ONLYONEALLOCCONSUME                                     RWStructuredBuffers may increment or decrement their counters, but not both.
+INSTR.OPCODERESERVED                                          Instructions must not reference reserved opcodes.
+INSTR.OPCONST                                                 DXIL intrinsic requires an immediate constant operand
+INSTR.OPCONSTRANGE                                            Constant values must be in-range for operation.
+INSTR.OPERANDRANGE                                            DXIL intrinsic operand must be within defined range
+INSTR.PARAMMULTIPLE                                           Parameter must be a valid multiple
+INSTR.PTRBITCAST                                              Pointer type bitcast must have the same size.
+INSTR.REORDERCOHERENTREQUIRESSM69                             reordercoherent requires SM 6.9 or later.
+INSTR.RESOURCECLASSFORLOAD                                    load can only run on UAV/SRV resource.
+INSTR.RESOURCECLASSFORSAMPLERGATHER                           sample, lod and gather should be on srv resource.
+INSTR.RESOURCECLASSFORUAVSTORE                                store should be on uav resource.
+INSTR.RESOURCECOORDINATEMISS                                  coord uninitialized.
+INSTR.RESOURCECOORDINATETOOMANY                               out of bound coord must be undef.
+INSTR.RESOURCEKINDFORBUFFERLOADSTORE                          buffer load/store only works on Raw/Typed/StructuredBuffer.
+INSTR.RESOURCEKINDFORCALCLOD                                  lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray.
+INSTR.RESOURCEKINDFORGATHER                                   gather requires resource declared as texture/2D/Cube/2DArray/CubeArray.
+INSTR.RESOURCEKINDFORGETDIM                                   Invalid resource kind on GetDimensions.
+INSTR.RESOURCEKINDFORSAMPLE                                   sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray.
+INSTR.RESOURCEKINDFORSAMPLEC                                  samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray.
+INSTR.RESOURCEKINDFORTEXTURELOAD                              texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray.
+INSTR.RESOURCEKINDFORTEXTURESTORE                             texture store only works on Texture1D/1DArray/2D/2DArray/3D.
+INSTR.RESOURCEKINDFORTRACERAY                                 TraceRay should only use RTAccelerationStructure.
+INSTR.RESOURCEMAPTOSINGLEENTRY                                Fail to map resource to resource table.
+INSTR.RESOURCEOFFSETMISS                                      offset uninitialized.
+INSTR.RESOURCEOFFSETTOOMANY                                   out of bound offset must be undef.
+INSTR.RESOURCEUSER                                            Resource should only be used by Load/GEP/Call.
+INSTR.SAMPLECOMPTYPE                                          sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
+INSTR.SAMPLEINDEXFORLOAD2DMS                                  load on Texture2DMS/2DMSArray require sampleIndex.
+INSTR.SAMPLERMODEFORLOD                                       lod instruction requires sampler declared in default mode.
+INSTR.SAMPLERMODEFORSAMPLE                                    sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode.
+INSTR.SAMPLERMODEFORSAMPLEC                                   sample_c_*/gather_c instructions require sampler declared in comparison mode.
+INSTR.SIGNATUREOPERATIONNOTINENTRY                            Dxil operation for input output signature must be in entryPoints.
+INSTR.STATUS                                                  Resource status should only be used by CheckAccessFullyMapped.
+INSTR.STRUCTBITCAST                                           Bitcast on struct types is not allowed.
+INSTR.SVCONFLICTINGLAUNCHMODE                                 Input system values are compatible with node shader launch mode.
+INSTR.TEXTUREOFFSET                                           offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7.
+INSTR.TGSMRACECOND                                            Race condition writing to shared memory detected, consider making this write conditional.
+INSTR.UNDEFHITOBJECT                                          HitObject is undef.
+INSTR.UNDEFINEDVALUEFORUAVSTORE                               Assignment of undefined values to UAV.
+INSTR.UNDEFRESULTFORGETDIMENSION                              GetDimensions used undef dimension %0 on %1.
+INSTR.WRITEMASKFORTYPEDUAVSTORE                               store on typed uav must write to all four components of the UAV.
+INSTR.WRITEMASKGAPFORUAV                                      UAV write mask must be contiguous, starting at x: .x, .xy, .xyz, or .xyzw.
+INSTR.WRITEMASKMATCHVALUEFORUAVSTORE                          uav store write mask must match store value mask, write mask is %0 and store value mask is %1.
+META.BARYCENTRICSFLOAT3                                       only 'float3' type is allowed for SV_Barycentrics.
+META.BARYCENTRICSINTERPOLATION                                SV_Barycentrics cannot be used with 'nointerpolation' type.
+META.BARYCENTRICSTWOPERSPECTIVES                              There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
+META.BRANCHFLATTEN                                            Can't use branch and flatten attributes together.
+META.CLIPCULLMAXCOMPONENTS                                    Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
+META.CLIPCULLMAXROWS                                          Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
+META.COHERENCENOTONAPPENDCONSUME                              globally/reorder coherent incompatible with append/consume/counter buffers
+META.COMPUTEWITHNODE                                          Compute entry must not have node metadata
+META.CONTROLFLOWHINTNOTONCONTROLFLOW                          Control flow hint only works on control flow inst.
+META.DENSERESIDS                                              Resource identifiers must be zero-based and dense.
+META.DUPLICATESYSVALUE                                        System value may only appear once in signature
+META.ENTRYFUNCTION                                            entrypoint not found.
+META.FLAGSUSAGE                                               Flags must match usage.
+META.FORCECASEONSWITCH                                        Attribute forcecase only works for switch.
+META.INTEGERINTERPMODE                                        Interpolation mode on integer must be Constant
+META.INTERPMODEINONEROW                                       Interpolation mode must be identical for all elements packed into the same row.
+META.INTERPMODEVALID                                          Interpolation mode must be valid
+META.INVALIDCONTROLFLOWHINT                                   Invalid control flow hint.
+META.KNOWN                                                    Named metadata should be known
+META.MAXTESSFACTOR                                            Hull Shader MaxTessFactor must be [%0..%1].  %2 specified.
+META.NOENTRYPROPSFORENTRY                                     Entry point %0 must have entry properties.
+META.NOSEMANTICOVERLAP                                        Semantics must not overlap
+META.REQUIRED                                                 Required metadata missing.
+META.SEMAKINDMATCHESNAME                                      Semantic name must match system value, when defined.
+META.SEMAKINDVALID                                            Semantic kind must be valid
+META.SEMANTICCOMPTYPE                                         %0 must be %1.
+META.SEMANTICINDEXMAX                                         System value semantics have a maximum valid semantic index
+META.SEMANTICLEN                                              Semantic length must be at least 1 and at most 64.
+META.SEMANTICSHOULDBEALLOCATED                                Semantic should have a valid packing location
+META.SEMANTICSHOULDNOTBEALLOCATED                             Semantic should have a packing location of -1
+META.SIGNATURECOMPTYPE                                        signature %0 specifies unrecognized or invalid component type.
+META.SIGNATUREDATAWIDTH                                       Data width must be identical for all elements packed into the same row.
+META.SIGNATUREILLEGALCOMPONENTORDER                           Component ordering for packed elements must be: arbitrary < system value < system generated value
+META.SIGNATUREINDEXCONFLICT                                   Only elements with compatible indexing rules may be packed together
+META.SIGNATUREOUTOFRANGE                                      Signature elements must fit within maximum signature size
+META.SIGNATUREOVERLAP                                         Signature elements may not overlap in packing location.
+META.STRUCTBUFALIGNMENT                                       StructuredBuffer stride not aligned
+META.STRUCTBUFALIGNMENTOUTOFBOUND                             StructuredBuffer stride out of bounds
+META.SYSTEMVALUEROWS                                          System value may only have 1 row
+META.TARGET                                                   Target triple must be 'dxil-ms-dx'
+META.TESSELLATOROUTPUTPRIMITIVE                               Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
+META.TESSELLATORPARTITION                                     Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
+META.TEXTURETYPE                                              elements of typed buffers and textures must fit in four 32-bit quantities.
+META.USED                                                     All metadata must be used by dxil.
+META.VALIDSAMPLERMODE                                         Invalid sampler mode on sampler.
+META.VALUERANGE                                               Metadata value must be within range.
+META.VERSIONSUPPORTED                                         Version in metadata must be supported.
+META.WELLFORMED                                               Metadata must be well-formed in operand count and types.
+SM.64BITRAWBUFFERLOADSTORE                                    i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3.
+SM.AMPLIFICATIONSHADERPAYLOADSIZE                             For amplification shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes.
+SM.AMPLIFICATIONSHADERPAYLOADSIZEDECLARED                     For amplification shader with entry '%0', payload size %1 is greater than declared size of %2 bytes.
+SM.APPENDANDCONSUMEONSAMEUAV                                  BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
+SM.CBUFFERARRAYOFFSETALIGNMENT                                CBuffer array offset must be aligned to 16-bytes
+SM.CBUFFERELEMENTOVERFLOW                                     CBuffer elements must not overflow
+SM.CBUFFEROFFSETOVERLAP                                       CBuffer offsets must not overlap
+SM.CBUFFERSIZE                                                CBuffer size must not exceed 65536 bytes
+SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT                            D3D12 constant/texture buffer template element can only be a struct.
+SM.COMPLETEPOSITION                                           Not all elements of SV_Position were written.
+SM.CONSTANTINTERPMODE                                         Interpolation mode must be constant for MS primitive output.
+SM.COUNTERONLYONSTRUCTBUF                                     BufferUpdateCounter valid only on structured buffers.
+SM.CSNOSIGNATURES                                             Compute shaders must not have shader signatures.
+SM.DOMAINLOCATIONIDXOOB                                       DomainLocation component index out of bounds for the domain.
+SM.DSINPUTCONTROLPOINTCOUNTRANGE                              DS input control point count must be [0..%0].  %1 specified.
+SM.DXILVERSION                                                Target shader model requires specific Dxil Version
+SM.GSINSTANCECOUNTRANGE                                       GS instance count must be [1..%0].  %1 specified.
+SM.GSOUTPUTVERTEXCOUNTRANGE                                   GS output vertex count must be [0..%0].  %1 specified.
+SM.GSTOTALOUTPUTVERTEXDATARANGE                               Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3.
+SM.GSVALIDINPUTPRIMITIVE                                      GS input primitive unrecognized.
+SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY                             GS output primitive topology unrecognized.
+SM.HSINPUTCONTROLPOINTCOUNTRANGE                              HS input control point count must be [0..%0].  %1 specified.
+SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH                         For pass thru hull shader, input control point count must match output control point count
+SM.INCOMPATIBLECALLINENTRY                                    Features used in internal function calls must be compatible with entry
+SM.INCOMPATIBLEDERIVINCOMPUTESHADERMODEL                      Derivatives in compute-model shaders require shader model 6.6 and above
+SM.INCOMPATIBLEDERIVLAUNCH                                    Node shaders only support derivatives in broadcasting launch mode
+SM.INCOMPATIBLEOPERATION                                      Operations used in entry function must be compatible with shader stage and other properties
+SM.INCOMPATIBLEREQUIRESGROUP                                  Functions requiring groupshared memory must be called from shaders with a visible group
+SM.INCOMPATIBLESHADERMODEL                                    Functions may only use features available in the current shader model
+SM.INCOMPATIBLESTAGE                                          Functions may only use features available in the entry function's stage
+SM.INCOMPATIBLETHREADGROUPDIM                                 When derivatives are used in compute-model shaders, the thread group dimensions must be compatible
+SM.INSIDETESSFACTORSIZEMATCHDOMAIN                            InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.INVALIDRESOURCECOMPTYPE                                    Invalid resource return type.
+SM.INVALIDRESOURCEKIND                                        Invalid resources kind.
+SM.INVALIDSAMPLERFEEDBACKTYPE                                 Invalid sampler feedback type.
+SM.INVALIDTEXTUREKINDONUAV                                    TextureCube[Array] resources are not supported with UAVs.
+SM.ISOLINEOUTPUTPRIMITIVEMISMATCH                             Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
+SM.MAXMSSMSIZE                                                Total Thread Group Shared Memory storage is %0, exceeded %1.
+SM.MAXTGSMSIZE                                                Total Thread Group Shared Memory storage is %0, exceeded %1.
+SM.MAXTHEADGROUP                                              Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1.
+SM.MESHPSIGROWCOUNT                                           For shader '%0', primitive output signatures are taking up more than %1 rows.
+SM.MESHSHADERINOUTSIZE                                        For shader '%0', payload plus output size is greater than %1.
+SM.MESHSHADERMAXPRIMITIVECOUNT                                MS max primitive output count must be [0..%0].  %1 specified.
+SM.MESHSHADERMAXVERTEXCOUNT                                   MS max vertex output count must be [0..%0].  %1 specified.
+SM.MESHSHADEROUTPUTSIZE                                       For shader '%0', vertex plus primitive output size is greater than %1.
+SM.MESHSHADERPAYLOADSIZE                                      For mesh shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes.
+SM.MESHSHADERPAYLOADSIZEDECLARED                              For mesh shader with entry '%0', payload size %1 is greater than declared size of %2 bytes.
+SM.MESHTOTALSIGROWCOUNT                                       For shader '%0', vertex and primitive output signatures are taking up more than %1 rows.
+SM.MESHVSIGROWCOUNT                                           For shader '%0', vertex output signatures are taking up more than %1 rows.
+SM.MULTISTREAMMUSTBEPOINT                                     When multiple GS output streams are used they must be pointlists
+SM.NAME                                                       Target shader model name must be known
+SM.NOINTERPMODE                                               Interpolation mode must be undefined for VS input/PS output/patch constant.
+SM.NOPSOUTPUTIDX                                              Pixel shader output registers are not indexable.
+SM.OPCODE                                                     Opcode must be defined in target shader model
+SM.OPCODEININVALIDFUNCTION                                    Invalid DXIL opcode usage like StorePatchConstant in patch constant function
+SM.OPERAND                                                    Operand must be defined in target shader model.
+SM.OUTPUTCONTROLPOINTCOUNTRANGE                               output control point count must be [%0..%1].  %2 specified.
+SM.OUTPUTCONTROLPOINTSTOTALSCALARS                            Total number of scalars across all HS output control points must not exceed .
+SM.PATCHCONSTANTONLYFORHSDS                                   patch constant signature only valid in HS and DS.
+SM.PROGRAMVERSION                                             Program Version in Dxil Container does not match Dxil Module shader model version
+SM.PSCONSISTENTINTERP                                         Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample).
+SM.PSCOVERAGEANDINNERCOVERAGE                                 InnerCoverage and Coverage are mutually exclusive.
+SM.PSMULTIPLEDEPTHSEMANTIC                                    Pixel Shader only allows one type of depth semantic to be declared.
+SM.PSOUTPUTSEMANTIC                                           Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found.
+SM.PSTARGETCOL0                                               SV_Target packed location must start at column 0.
+SM.PSTARGETINDEXMATCHESROW                                    SV_Target semantic index must match packed row location.
+SM.RAYSHADERPAYLOADSIZE                                       For shader '%0', %1 size is smaller than argument's allocation size.
+SM.RAYSHADERSIGNATURES                                        Ray tracing shader '%0' should not have any shader signatures.
+SM.RESOURCERANGEOVERLAP                                       Resource ranges must not overlap
+SM.ROVONLYINPS                                                RasterizerOrdered objects are only allowed in 5.0+ pixel shaders.
+SM.SAMPLECOUNTONLYON2DMS                                      Only Texture2DMS/2DMSArray could has sample count.
+SM.SEMANTIC                                                   Semantic must be defined in target shader model
+SM.STREAMINDEXRANGE                                           Stream index (%0) must between 0 and %1.
+SM.TESSFACTORFORDOMAIN                                        Required TessFactor for domain not found declared anywhere in Patch Constant data.
+SM.TESSFACTORSIZEMATCHDOMAIN                                  TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.TGSMUNSUPPORTED                                            Thread Group Shared Memory not supported %0.
+SM.THREADGROUPCHANNELRANGE                                    Declared Thread Group %0 size %1 outside valid range [%2..%3].
+SM.TRIOUTPUTPRIMITIVEMISMATCH                                 Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain.
+SM.UNDEFINEDOUTPUT                                            Not all elements of output %0 were written.
+SM.VALIDDOMAIN                                                Invalid Tessellator Domain specified. Must be isoline, tri or quad.
+SM.VIEWIDNEEDSSLOT                                            ViewID requires compatible space in pixel shader input signature
+SM.WAVESIZEALLZEROWHENUNDEFINED                               WaveSize Max and Preferred must be 0 when Min is 0
+SM.WAVESIZEEXPECTSONEPARAM                                    WaveSize tag expects exactly 1 parameter.
+SM.WAVESIZEMAXANDPREFERREDZEROWHENNORANGE                     WaveSize Max and Preferred must be 0 to encode min==max
+SM.WAVESIZEMAXGREATERTHANMIN                                  WaveSize Max must greater than Min
+SM.WAVESIZENEEDSCONSTANTOPERANDS                              WaveSize metadata operands must be constant values.
+SM.WAVESIZENEEDSSM66OR67                                      WaveSize is valid only for Shader Model 6.6 and 6.7.
+SM.WAVESIZEONCOMPUTEORNODE                                    WaveSize only allowed on compute or node shaders
+SM.WAVESIZEPREFERREDINRANGE                                   WaveSize Preferred must be within Min..Max range
+SM.WAVESIZERANGEEXPECTSTHREEPARAMS                            WaveSize Range tag expects exactly 3 parameters.
+SM.WAVESIZERANGENEEDSSM68PLUS                                 WaveSize Range is valid only for Shader Model 6.8 and higher.
+SM.WAVESIZETAGDUPLICATE                                       WaveSize or WaveSizeRange tag may only appear once per entry point.
+SM.WAVESIZEVALUE                                              WaveSize value must be a power of 2 in range [4..128]
+SM.ZEROHSINPUTCONTROLPOINTWITHINPUT                           When HS input control point count is 0, no input signature should exist.
+TYPES.DEFINED                                                 Type must be defined based on DXIL primitives
+TYPES.I8                                                      I8 can only be used as immediate value for intrinsic or as i8* via bitcast by lifetime intrinsics.
+TYPES.INTWIDTH                                                Int type must be of valid width
+TYPES.NOMULTIDIM                                              Only one dimension allowed for array type.
+TYPES.NOPTRTOPTR                                              Pointers to pointers, or pointers in structures are not allowed.
+TYPES.NOVECTOR                                                Vector types must not be present
+============================================================= ========================================================================================================================================================================================================================================================================================================
 
 .. VALRULES-RST:END
 
diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index fe32c06f63..bf6de7ed3b 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -1609,6 +1609,7 @@ const unsigned kMatVecMulAddIsOutputUnsignedIdx = 15;
 // Outer Product Accumulate
 const unsigned kOuterProdAccMatrixInterpretation = 5;
 const unsigned kOuterProdAccMatrixLayout = 6;
+const unsigned kOuterProdAccMatrixStride = 7;
 
 // TODO: add operand index for all the OpCodeClass.
 } // namespace OperandIndex
diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index 28917e0600..db596a3821 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -1233,6 +1233,32 @@ static void ValidateImmOperandsForOuterProdAcc(CallInst *CI,
         {"MatrixLayout"});
     return;
   }
+  ConstantInt *ML = cast<ConstantInt>(MatrixLayout);
+  uint64_t MLValue = ML->getLimitedValue();
+  if (MLValue !=
+      static_cast<unsigned>(DXIL::LinalgMatrixLayout::OuterProductOptimal))
+    ValCtx.EmitInstrFormatError(
+        CI,
+        ValidationRule::
+            InstrLinalgInvalidMatrixLayoutValueForOuterProductAccumulate,
+        {GetMatrixLayoutStr(MLValue),
+         GetMatrixLayoutStr(static_cast<unsigned>(
+             DXIL::LinalgMatrixLayout::OuterProductOptimal))});
+
+  // The stride operand must be the integer constant zero. Classify it with
+  // dyn_cast rather than isa<Constant> followed by cast<ConstantInt>: undef
+  // and constant expressions are Constants but not ConstantInts, so cast<>
+  // would assert on them instead of letting the validator report an error.
+  llvm::Value *MatrixStride =
+      CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixStride);
+  ConstantInt *MS = dyn_cast<ConstantInt>(MatrixStride);
+  // A non-integer operand (null MS) and a non-zero stride are both reported
+  // under the same rule.
+  if (!MS || MS->getLimitedValue() != 0) {
+    ValCtx.EmitInstrError(
+        CI, ValidationRule::InstrLinalgMatrixStrideZeroForOptimalLayouts);
+    return;
+  }
 }
 
 // Validate the type-defined mask compared to the store value mask which
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
index 74cb51260c..75e7c8a5cd 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl
@@ -43,7 +43,7 @@ void UseCoopVec() {
     const uint opa_matrix_offset = 0;
     const uint opa_matrix_interpretation = 5; /*U32*/
     const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
-    const uint opa_matrix_stride = 64;
+    const uint opa_matrix_stride = 0;
 
     __builtin_OuterProductAccumulate(input_vector1, input_vector2,
       rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
index c3b4a3a8d7..f1badb9101 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl
@@ -58,12 +58,12 @@ void cs_main()
     const uint opa_matrix_offset = 0;
     const uint opa_matrix_interpretation = 5; /*U32*/
     const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
-    const uint opa_matrix_stride = 64;
+    const uint opa_matrix_stride = 0;
 
     // CHECK: %[[MLD2:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
     // CHECK: %[[MCH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD2]])
     // CHECK: %[[MAH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH2]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef)
-    // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH2]], i32 0, i32 5, i32 3, i32 64)
+    // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH2]], i32 0, i32 5, i32 3, i32 0)
     __builtin_OuterProductAccumulate(input_vector1, input_vector2,
       rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
       opa_matrix_layout, opa_matrix_stride);
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
index 40bbe62284..c40365078f 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
@@ -1,8 +1,8 @@
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-0
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2
 
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
 
@@ -11,12 +11,12 @@ ByteAddressBuffer input_vector_buffer2;
 RWByteAddressBuffer matrix_buffer;
 
 // COMMON: define void @main()
-// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
-// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64)
-// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
-// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64)
-// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
-// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64)
+// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0)
+// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0)
+// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0)
 
 enum CompType {
   Invalid = 0,
@@ -63,7 +63,7 @@ void main()
     const uint matrix_interpretation = MI;
     const uint matrix_layout = ML;
     const uint matrix_offset = 0;
-    const uint matrix_stride = 64;
+    const uint matrix_stride = 0;
 
     __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride);
 
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl
new file mode 100644
index 0000000000..e930557cf9
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -I %hlsl_headers -T cs_6_9 %s -enable-16bit-types -DML=MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL -DSTRIDE=0 2>&1 | FileCheck %s
+
+//Source file for the IR in \tools\clang\test\LitDXILValidation\outer-product-accumulate-matrix-layout-failing.ll
+//Source file for the IR in \tools\clang\test\LitDXILValidation\outer-product-accumulate-matrix-layout-passing.ll
+
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer input_vector_buffer2;
+RWByteAddressBuffer matrix_buffer;
+
+#include <dx/linalg.h>
+
+// CHECK: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0)
+using namespace dx::linalg;
+
+[Numthreads(1,1,1)]
+[shader("compute")]
+void main()
+{
+  vector<half, 8> input_vector1 = input_vector_buffer.Load<vector<half, 8> >(0);
+  vector<half, 8> input_vector2 = input_vector_buffer2.Load<vector<half, 8> >(0);
+
+  const uint matrix_interpretation = DATA_TYPE_FLOAT16;
+  const uint matrix_layout = ML;
+  const uint matrix_offset = 0;
+  const uint matrix_stride = STRIDE;
+
+  __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride);
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
index 6623f63031..ea1be46c4c 100644
--- a/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
+++ b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll
@@ -76,8 +76,8 @@ entry:
 
   ;CHECK: %[[RWMCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]]
   ;CHECK: %[[RWMAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH0]]
-  ;CHECK: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH0]], i32 0, i32 5, i32 3, i32 64)
-  call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %tmp25, <8 x i32> %tmp29, %dx.types.Handle %tmp32, i32 0, i32 5, i32 3, i32 64), !dbg !37 ; line:67 col:5
+  ;CHECK: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH0]], i32 0, i32 5, i32 3, i32 0)
+  call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %tmp25, <8 x i32> %tmp29, %dx.types.Handle %tmp32, i32 0, i32 5, i32 3, i32 0), !dbg !37 ; line:67 col:5
 
   
   %tmp33 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !38 ; line:77 col:5
diff --git a/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll
new file mode 100644
index 0000000000..33591126e5
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll
@@ -0,0 +1,86 @@
+; REQUIRES: dxil-1-9
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; Original Source: \tools\clang\test\CodeGenDXIL\hlsl\linalg\outer-product-accumulate-matrix-layout.hlsl
+; The failing cases were produced by manually editing the IR of the passing
+; case, which was generated by compiling the HLSL above (Original Source)
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.ResourceProperties = type { i32, i32 }
+%dx.types.ResRet.v8f16 = type { <8 x half>, i32 }
+%struct.ByteAddressBuffer = type { i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+; As noted in other tests, the validation errors are emitted in an order
+; different from the IR, so they are listed here in the order they appear,
+; with comments in the IR below for correlation.
+
+;CHECK: error: matrix stride must be a constant zero for optimal layouts
+;CHECK: error: matrix stride must be a constant zero for optimal layouts
+;CHECK-NOT: error: matrix layout value 'OuterProductOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+;CHECK: error: matrix layout value 'MulOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+;CHECK: error: matrix layout value 'ColumnMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+;CHECK: error: matrix layout value 'RowMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+; CHECK: Validation failed.
+
+define void @main() {
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %2 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %3 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind zeroinitializer, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 11, i32 0 })  ; AnnotateHandle(res,props)  resource: ByteAddressBuffer
+  %5 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %4, i32 0, i32 undef, i32 2)  ; RawBufferVectorLoad(buf,index,elementOffset,alignment)
+  %6 = extractvalue %dx.types.ResRet.v8f16 %5, 0
+  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 11, i32 0 })  ; AnnotateHandle(res,props)  resource: ByteAddressBuffer
+  %8 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %7, i32 0, i32 undef, i32 2)  ; RawBufferVectorLoad(buf,index,elementOffset,alignment)
+  %9 = extractvalue %dx.types.ResRet.v8f16 %8, 0
+  %10 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; error: matrix layout value 'RowMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 0, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ; error: matrix layout value 'ColumnMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 1, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ; error: matrix layout value 'MulOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal'
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 2, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ; error: matrix stride must be a constant zero for optimal layouts
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 64)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ; error: matrix stride must be a constant zero for optimal layouts
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 63)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.outerProductAccumulate.v8f16.v8f16(i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #2
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.entryPoints = !{!8}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"cs", i32 6, i32 9}
+!2 = !{!3, !6, null, null}
+!3 = !{!4, !5}
+!4 = !{i32 0, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i32 0, null}
+!5 = !{i32 1, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 1, i32 1, i32 11, i32 0, null}
+!6 = !{!7}
+!7 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!8 = !{void ()* @main, !"main", null, !2, !9}
+!9 = !{i32 0, i64 8598323216, i32 4, !10}
+!10 = !{i32 1, i32 1, i32 1}
diff --git a/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll
new file mode 100644
index 0000000000..44cd3e48b3
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll
@@ -0,0 +1,65 @@
+; REQUIRES: dxil-1-9
+; RUN: %dxv %s 2>&1 | FileCheck %s
+
+;Original Source: \tools\clang\test\CodeGenHLSL\linalg\outer-product-accumulate-matrix-layout.hlsl
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.ResourceProperties = type { i32, i32 }
+%dx.types.ResRet.v8f16 = type { <8 x half>, i32 }
+%struct.ByteAddressBuffer = type { i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+;CHECK: Validation succeeded.
+
+define void @main() {
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %2 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %3 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind zeroinitializer, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 11, i32 0 })  ; AnnotateHandle(res,props)  resource: ByteAddressBuffer
+  %5 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %4, i32 0, i32 undef, i32 2)  ; RawBufferVectorLoad(buf,index,elementOffset,alignment)
+  %6 = extractvalue %dx.types.ResRet.v8f16 %5, 0
+  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 11, i32 0 })  ; AnnotateHandle(res,props)  resource: ByteAddressBuffer
+  %8 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %7, i32 0, i32 undef, i32 2)  ; RawBufferVectorLoad(buf,index,elementOffset,alignment)
+  %9 = extractvalue %dx.types.ResRet.v8f16 %8, 0
+  %10 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.outerProductAccumulate.v8f16.v8f16(i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #2
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
+
+!dx.version = !{!0}
+!dx.valver = !{!0}
+!dx.shaderModel = !{!1}
+!dx.resources = !{!2}
+!dx.entryPoints = !{!8}
+
+!0 = !{i32 1, i32 9}
+!1 = !{!"cs", i32 6, i32 9}
+!2 = !{!3, !6, null, null}
+!3 = !{!4, !5}
+!4 = !{i32 0, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i32 0, null}
+!5 = !{i32 1, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 1, i32 1, i32 11, i32 0, null}
+!6 = !{!7}
+!7 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!8 = !{void ()* @main, !"main", null, !2, !9}
+!9 = !{i32 0, i64 8598323216, i32 4, !10}
+!10 = !{i32 1, i32 1, i32 1}
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 65f9aa1d80..5567a6a88d 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -8452,6 +8452,12 @@ def build_valrules(self):
             "matrix layout value '%0' is not valid. Must be between [%1 - %2]",
         )
 
+        self.add_valrule_msg(
+            "Instr.LinalgMatrixStrideZeroForOptimalLayouts",
+            "For optimal layouts, matrix stride must be zero.",
+            "matrix stride must be a constant zero for optimal layouts",
+        )
+
         self.add_valrule_msg(
             "Instr.LinalgMatrixLayoutNotTransposable",
             "Row Major and Column Major matrix layouts are not transposable.",
@@ -8464,6 +8470,12 @@ def build_valrules(self):
             "IsUnsigned flag set to true for a float type '%0' vector",
         )
 
+        self.add_valrule_msg(
+            "Instr.LinalgInvalidMatrixLayoutValueForOuterProductAccumulate",
+            "Matrix Layout for Linalg Mul/MulAdd operation must be valid.",
+            "matrix layout value '%0' is not valid for outerproductaccumulate, must be '%1'",
+        )
+
         # Some legacy rules:
         # - space is only supported for shader targets 5.1 and higher
         # - multiple rules regarding derivatives, which isn't a supported feature for DXIL

From f5214f17ec23fe3fd263315e643fe5f470a8ea84 Mon Sep 17 00:00:00 2001
From: Tim Corringham <timothy.corringham@amd.com>
Date: Wed, 14 May 2025 18:45:50 +0100
Subject: [PATCH 34/93] Support SV_DispatchGrid semantic in a nested record
 (#6931)

The SV_DispatchGrid DXIL metadata for a node input record was not
generated in cases where:
- the field with the SV_DispatchGrid semantic was in a nested record
- the field with the SV_DispatchGrid semantic was in a record field
- the field with the SV_DispatchGrid semantic was inherited from a base
record
- in any combinations of the above

Added FindDispatchGridSemantic() to be used by the
AddHLSLNodeRecordTypeInfo() function, and added a test case.

Fixes #6928

---------

Co-authored-by: Tim Corringham <tcorring@amd.com>
Co-authored-by: Tex Riddell <texr@microsoft.com>
---
 tools/clang/lib/CodeGen/CGHLSLMS.cpp          | 121 ++++++++--------
 .../workgraph/nested_sv_dispatchgrid.hlsl     | 130 ++++++++++++++++++
 2 files changed, 196 insertions(+), 55 deletions(-)
 create mode 100644 tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl

diff --git a/tools/clang/lib/CodeGen/CGHLSLMS.cpp b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
index 16ddeaec60..b5add521a6 100644
--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
@@ -288,6 +288,9 @@ class CGMSHLSLRuntime : public CGHLSLRuntime {
                                            llvm::Value *DestPtr,
                                            clang::QualType DestTy) override;
   void AddHLSLFunctionInfo(llvm::Function *, const FunctionDecl *FD) override;
+  bool FindDispatchGridSemantic(const CXXRecordDecl *RD,
+                                hlsl::SVDispatchGrid &SDGRec,
+                                CharUnits Offset = CharUnits());
   void AddHLSLNodeRecordTypeInfo(const clang::ParmVarDecl *parmDecl,
                                  hlsl::NodeIOProperties &node);
   void EmitHLSLFunctionProlog(llvm::Function *,
@@ -2560,6 +2563,66 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   m_ScopeMap[F] = ScopeInfo(F, FD->getLocation());
 }
 
+// Find the input node record field with the SV_DispatchGrid semantic.
+// We have already diagnosed any error conditions in Sema, so we
+// expect valid size and types, and use the first occurrence found.
+// We return true if we have populated the SV_DispatchGrid values.
+bool CGMSHLSLRuntime::FindDispatchGridSemantic(const CXXRecordDecl *RD,
+                                               hlsl::SVDispatchGrid &SDGRec,
+                                               CharUnits Offset) {
+  const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(RD);
+
+  // Check (non-virtual) bases
+  for (const CXXBaseSpecifier &Base : RD->bases()) {
+    DXASSERT(!Base.getType()->isDependentType(),
+             "Node Record with dependent base class not caught by Sema");
+    if (Base.getType()->isDependentType())
+      continue;
+    CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
+    CharUnits BaseOffset = Offset + Layout.getBaseClassOffset(BaseDecl);
+    if (FindDispatchGridSemantic(BaseDecl, SDGRec, BaseOffset))
+      return true;
+  }
+
+  // Check each field in this record.
+  for (FieldDecl *Field : RD->fields()) {
+    uint64_t FieldNo = Field->getFieldIndex();
+    CharUnits FieldOffset = Offset + CGM.getContext().toCharUnitsFromBits(
+                                         Layout.getFieldOffset(FieldNo));
+
+    // If this field is a record check its fields
+    if (const CXXRecordDecl *D = Field->getType()->getAsCXXRecordDecl()) {
+      if (FindDispatchGridSemantic(D, SDGRec, FieldOffset))
+        return true;
+    }
+    // Otherwise check this field for the SV_DispatchGrid semantic annotation
+    for (const hlsl::UnusualAnnotation *UA : Field->getUnusualAnnotations()) {
+      if (UA->getKind() == hlsl::UnusualAnnotation::UA_SemanticDecl) {
+        const hlsl::SemanticDecl *SD = cast<hlsl::SemanticDecl>(UA);
+        if (SD->SemanticName.equals("SV_DispatchGrid")) {
+          const llvm::Type *FTy = CGM.getTypes().ConvertType(Field->getType());
+          const llvm::Type *ElTy = FTy;
+          SDGRec.NumComponents = 1;
+          SDGRec.ByteOffset = (unsigned)FieldOffset.getQuantity();
+          if (const llvm::VectorType *VT = dyn_cast<llvm::VectorType>(FTy)) {
+            SDGRec.NumComponents = VT->getNumElements();
+            ElTy = VT->getElementType();
+          } else if (const llvm::ArrayType *AT =
+                         dyn_cast<llvm::ArrayType>(FTy)) {
+            SDGRec.NumComponents = AT->getNumElements();
+            ElTy = AT->getElementType();
+          }
+          SDGRec.ComponentType = (ElTy->getIntegerBitWidth() == 16)
+                                     ? DXIL::ComponentType::U16
+                                     : DXIL::ComponentType::U32;
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo(
     const clang::ParmVarDecl *parmDecl, hlsl::NodeIOProperties &node) {
   clang::QualType paramTy = parmDecl->getType().getCanonicalType();
@@ -2577,7 +2640,6 @@ void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo(
         DiagnosticsEngine &Diags = CGM.getDiags();
         auto &Rec = TemplateArgs.get(0);
         clang::QualType RecType = Rec.getAsType();
-        llvm::Type *Type = CGM.getTypes().ConvertType(RecType);
         CXXRecordDecl *RD = RecType->getAsCXXRecordDecl();
 
         // Get the TrackRWInputSharing flag from the record attribute
@@ -2597,63 +2659,12 @@ void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo(
 
         // Ex: For DispatchNodeInputRecord<MY_RECORD>, set size =
         // size(MY_RECORD), alignment = alignof(MY_RECORD)
+        llvm::Type *Type = CGM.getTypes().ConvertType(RecType);
         node.RecordType.size = CGM.getDataLayout().getTypeAllocSize(Type);
         node.RecordType.alignment =
             CGM.getDataLayout().getABITypeAlignment(Type);
-        // Iterate over fields of the MY_RECORD(example) struct
-        for (auto fieldDecl : RD->fields()) {
-          // Check if any of the fields have a semantic annotation =
-          // SV_DispatchGrid
-          for (const hlsl::UnusualAnnotation *it :
-               fieldDecl->getUnusualAnnotations()) {
-            if (it->getKind() == hlsl::UnusualAnnotation::UA_SemanticDecl) {
-              const hlsl::SemanticDecl *sd = cast<hlsl::SemanticDecl>(it);
-              // if we find a field with SV_DispatchGrid, fill out the
-              // SV_DispatchGrid member with byteoffset of the field,
-              // NumComponents (3 for uint3 etc) and U32 vs U16 types, which are
-              // the only types allowed
-              if (sd->SemanticName.equals("SV_DispatchGrid")) {
-                clang::QualType FT = fieldDecl->getType();
-                auto &DL = CGM.getDataLayout();
-                auto &SDGRec = node.RecordType.SV_DispatchGrid;
-
-                DXASSERT_NOMSG(SDGRec.NumComponents == 0);
-
-                unsigned fieldIdx = fieldDecl->getFieldIndex();
-                if (StructType *ST = dyn_cast<StructType>(Type)) {
-                  SDGRec.ByteOffset =
-                      DL.getStructLayout(ST)->getElementOffset(fieldIdx);
-                }
-                const llvm::Type *lTy = CGM.getTypes().ConvertType(FT);
-                if (const llvm::VectorType *VT =
-                        dyn_cast<llvm::VectorType>(lTy)) {
-                  DXASSERT(VT->getElementType()->isIntegerTy(), "invalid type");
-                  SDGRec.NumComponents = VT->getNumElements();
-                  SDGRec.ComponentType =
-                      (VT->getElementType()->getIntegerBitWidth() == 16)
-                          ? DXIL::ComponentType::U16
-                          : DXIL::ComponentType::U32;
-                } else if (const llvm::ArrayType *AT =
-                               dyn_cast<llvm::ArrayType>(lTy)) {
-                  DXASSERT(AT->getElementType()->isIntegerTy(), "invalid type");
-                  DXASSERT_NOMSG(AT->getNumElements() <= 3);
-                  SDGRec.NumComponents = AT->getNumElements();
-                  SDGRec.ComponentType =
-                      (AT->getElementType()->getIntegerBitWidth() == 16)
-                          ? DXIL::ComponentType::U16
-                          : DXIL::ComponentType::U32;
-                } else {
-                  // Scalar U16 or U32
-                  DXASSERT(lTy->isIntegerTy(), "invalid type");
-                  SDGRec.NumComponents = 1;
-                  SDGRec.ComponentType = (lTy->getIntegerBitWidth() == 16)
-                                             ? DXIL::ComponentType::U16
-                                             : DXIL::ComponentType::U32;
-                }
-              }
-            }
-          }
-        }
+
+        FindDispatchGridSemantic(RD, node.RecordType.SV_DispatchGrid);
       }
     }
   }
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl
new file mode 100644
index 0000000000..1da45dae1d
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl
@@ -0,0 +1,130 @@
+// RUN: %dxc -T lib_6_8 %s | FileCheck %s
+
+// Check that the SV_DispatchGrid DXIL metadata for a node input record is
+// generated in cases where:
+// node1 - the field with the SV_DispatchGrid semantic is in a nested record
+// node2 - the field with the SV_DispatchGrid semantic is in a record field
+// node3 - the field with the SV_DispatchGrid semantic is inherited from a base record
+// node4 - the field with the SV_DispatchGrid semantic is within a nested record inherited from a base record
+// node5 - the field with the SV_DispatchGrid semantic is within a base record of a nested record
+// node6 - the field with the SV_DispatchGrid semantic is within a templated base record
+// node7 - the field with the SV_DispatchGrid semantic is within a templated base record of a templated record
+// node8 - the field with the SV_DispatchGrid semantic has templated type
+
+struct Record1 {
+    struct {
+      // SV_DispatchGrid is within a nested record
+      uint3 grid : SV_DispatchGrid;
+    };
+};
+
+[Shader("node")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node1(DispatchNodeInputRecord<Record1> input) {}
+// CHECK: {!"node1"
+// CHECK: , i32 1, ![[SVDG_1:[0-9]+]]
+// CHECK: [[SVDG_1]] = !{i32 0, i32 5, i32 3}
+
+struct Record2a {
+  uint u;
+  uint2 grid : SV_DispatchGrid;
+};
+
+struct Record2 {
+  uint a;
+  // SV_DispatchGrid is within a record field
+  Record2a b;
+};
+
+[Shader("node")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node2(DispatchNodeInputRecord<Record2> input) {}
+// CHECK: {!"node2"
+// CHECK: , i32 1, ![[SVDG_2:[0-9]+]]
+// CHECK: [[SVDG_2]] = !{i32 8, i32 5, i32 2}
+
+struct Record3 : Record2a {
+  // SV_DispatchGrid is inherited
+  uint4 n;
+};
+
+[Shader("node")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node3(DispatchNodeInputRecord<Record3> input) {}
+// CHECK: {!"node3"
+// CHECK: , i32 1, ![[SVDG_3:[0-9]+]]
+// CHECK: [[SVDG_3]] = !{i32 4, i32 5, i32 2}
+
+struct Record4 : Record2 {
+  // SV_DispatchGrid is in a nested field in a base record
+  float f;
+};
+
+[Shader("node")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node4(DispatchNodeInputRecord<Record4> input) {}
+// CHECK: {!"node4"
+// CHECK: , i32 1, ![[SVDG_2]]
+
+struct Record5 {
+  uint4 x;
+  // SV_DispatchGrid is in a base record of a record field
+  Record3 r;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node5(DispatchNodeInputRecord<Record5> input) {}
+// CHECK: {!"node5"
+// CHECK: , i32 1, ![[SVDG_5:[0-9]+]]
+// CHECK: [[SVDG_5]] = !{i32 20, i32 5, i32 2}
+
+template <typename T>
+struct Base {
+  T DG : SV_DispatchGrid;
+};
+
+struct Derived1 : Base<uint3> {
+  int4 x;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node6(DispatchNodeInputRecord<Derived1 > input) {}
+// CHECK: {!"node6"
+// CHECK: , i32 1, ![[SVDG_1]]
+
+template <typename T>
+struct Derived2 : Base<T> {
+  T Y;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node7(DispatchNodeInputRecord<Derived2<uint2> > input) {}
+// CHECK: {!"node7"
+// CHECK: , i32 1, ![[SVDG_7:[0-9]+]]
+// CHECK: [[SVDG_7]] = !{i32 0, i32 5, i32 2}
+
+template <typename T>
+struct Derived3 {
+  Derived2<T> V;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeMaxDispatchGrid(32,16,1)]
+[NumThreads(32,1,1)]
+void node8(DispatchNodeInputRecord< Derived3 <uint3> > input) {}
+// CHECK: {!"node8"
+// CHECK: , i32 1, ![[SVDG_1]]

From 8b406b5717ca17874bd6b2ce832a8802c6fb3979 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Wed, 14 May 2025 17:57:23 -0700
Subject: [PATCH 35/93] Fix detection of builtin UDT DXR struct types (#7452)

Built-in DXR struct types RayDesc and
BuiltInTriangleIntersectionAttributes were not treated identically to
other UDT types.

This caused differences in intrinsic codegen when one of these types is
returned.

This change corrects this difference so these builtin structs are
handled in the same way as other UDTs.

Fixes #7450.
---
 tools/clang/include/clang/AST/HlslTypes.h     |  1 -
 tools/clang/lib/AST/HlslTypes.cpp             | 28 ++++++-------
 .../hitobject_attributes_builtin.hlsl         | 42 +++++++++++++++++++
 3 files changed, 54 insertions(+), 17 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl

diff --git a/tools/clang/include/clang/AST/HlslTypes.h b/tools/clang/include/clang/AST/HlslTypes.h
index 3a02824b3a..c14f562101 100644
--- a/tools/clang/include/clang/AST/HlslTypes.h
+++ b/tools/clang/include/clang/AST/HlslTypes.h
@@ -494,7 +494,6 @@ DXIL::NodeIOKind GetNodeIOType(clang::QualType type);
 
 bool IsHLSLStructuredBufferType(clang::QualType type);
 bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type);
-bool IsHLSLNumericUserDefinedType(clang::QualType type);
 bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT);
 bool IsHLSLBuiltinRayAttributeStruct(clang::QualType QT);
 bool IsHLSLAggregateType(clang::QualType type);
diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index 5b19e064a3..07efb53c8c 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -103,14 +103,19 @@ bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type) {
          BuiltinTy->getKind() != BuiltinType::Kind::Char_S;
 }
 
-bool IsHLSLNumericUserDefinedType(clang::QualType type) {
-  const clang::Type *Ty = type.getCanonicalType().getTypePtr();
+// In some cases we need record types that are annotatable and trivially
+// copyable from outside the shader. This excludes resource types which may be
+// trivially copyable inside the shader, and builtin matrix and vector types
+// which can't be annotated. But includes UDTs of trivially copyable data and
+// the builtin trivially copyable raytracing structs.
+bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) {
+  const clang::Type *Ty = QT.getCanonicalType().getTypePtr();
   if (const RecordType *RT = dyn_cast<RecordType>(Ty)) {
     const RecordDecl *RD = RT->getDecl();
-    if (!IsUserDefinedRecordType(type))
+    if (!IsUserDefinedRecordType(QT))
       return false;
-    for (auto member : RD->fields()) {
-      if (!IsHLSLNumericOrAggregateOfNumericType(member->getType()))
+    for (auto Member : RD->fields()) {
+      if (!IsHLSLNumericOrAggregateOfNumericType(Member->getType()))
         return false;
     }
     return true;
@@ -118,16 +123,6 @@ bool IsHLSLNumericUserDefinedType(clang::QualType type) {
   return false;
 }
 
-// In some cases we need record types that are annotatable and trivially
-// copyable from outside the shader. This excludes resource types which may be
-// trivially copyable inside the shader, and builtin matrix and vector types
-// which can't be annotated. But includes UDTs of trivially copyable data and
-// the builtin trivially copyable raytracing structs.
-bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) {
-  return IsHLSLNumericUserDefinedType(QT) ||
-         IsHLSLBuiltinRayAttributeStruct(QT);
-}
-
 bool IsHLSLBuiltinRayAttributeStruct(clang::QualType QT) {
   QT = QT.getCanonicalType();
   const clang::Type *Ty = QT.getTypePtr();
@@ -609,7 +604,8 @@ bool IsUserDefinedRecordType(clang::QualType QT) {
   const clang::Type *Ty = QT.getCanonicalType().getTypePtr();
   if (const RecordType *RT = dyn_cast<RecordType>(Ty)) {
     const RecordDecl *RD = RT->getDecl();
-    if (RD->isImplicit())
+    // Built-in ray tracing struct types are considered user defined types.
+    if (RD->isImplicit() && !IsHLSLBuiltinRayAttributeStruct(QT))
       return false;
     if (auto TD = dyn_cast<ClassTemplateSpecializationDecl>(RD))
       if (TD->getSpecializedTemplate()->isImplicit())
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl
new file mode 100644
index 0000000000..a096bb6f11
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl
@@ -0,0 +1,42 @@
+// RUN: %dxc /Tlib_6_9 %s | FileCheck %s
+// RUN: %dxc /Tlib_6_9 -fcgl %s | FileCheck %s -check-prefix=FCGL
+
+// Make sure that we can use the BuiltInTriangleIntersectionAttributes struct
+// as a template argument to GetAttributes.
+
+// For -fcgl, just check the form of the HL call.
+// FCGL: %{{[^ ]+}} = call %struct.BuiltInTriangleIntersectionAttributes* @"dx.hl.op..%struct.BuiltInTriangleIntersectionAttributes* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}})
+
+// CHECK: %[[ATTR:[^ ]+]] = alloca %struct.BuiltInTriangleIntersectionAttributes
+// CHECK: call void @dx.op.hitObject_Attributes.struct.BuiltInTriangleIntersectionAttributes(i32 289, %dx.types.HitObject %{{[^ ]+}}, %struct.BuiltInTriangleIntersectionAttributes* nonnull %[[ATTR]])
+
+RaytracingAccelerationStructure Scene : register(t0, space0);
+RWTexture2D<float4> RenderTarget : register(u0);
+
+struct [raypayload] RayPayload
+{
+    float4 color : write(caller, closesthit, miss) : read(caller);
+};
+
+typedef BuiltInTriangleIntersectionAttributes MyAttribs;
+
+[shader("raygeneration")]
+void MyRaygenShader()
+{
+    RayDesc ray;
+    ray.Origin = float3(0,0,0);
+    ray.Direction = float3(0, 0, 1);
+    ray.TMin = 0.001;
+    ray.TMax = 10000.0;
+
+    RayPayload payload = { float4(0, 0, 0, 0) };
+    float4 color = float4(1,1,1,1);
+
+    dx::HitObject hit = dx::HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, ray, payload);
+
+    MyAttribs attr = hit.GetAttributes<MyAttribs>();
+    payload.color += float4(attr,0,1);
+
+    // Write the raytraced color to the output texture.
+    RenderTarget[DispatchRaysIndex().xy] = payload.color;
+}

From 9b04d69dbfc181966a06fce46b9005d685558724 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 15 May 2025 14:38:09 -0400
Subject: [PATCH 36/93] [SPIRV] Cast derivative opts to 32-bits. (#7445)

The SPIR-V derivative operations require 32-bit floats. Smaller float
types can be cast to 32 bits to perform the operation. The FE already
emits a warning for 64-bit types.

Fixes #7431
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 88 +++++++++++++++++--
 tools/clang/lib/SPIRV/SpirvEmitter.h          | 15 ++++
 .../CodeGenSPIRV/intrinsics.ddx.double.hlsl   | 21 +++++
 .../CodeGenSPIRV/intrinsics.ddx.half.hlsl     | 19 ++++
 4 files changed, 137 insertions(+), 6 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 7337a33b01..4da8584eee 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -9484,12 +9484,17 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
     retVal = processIntrinsicPointerCast(callExpr, true);
     break;
   }
-    INTRINSIC_SPIRV_OP_CASE(ddx, DPdx, true);
-    INTRINSIC_SPIRV_OP_CASE(ddx_coarse, DPdxCoarse, false);
-    INTRINSIC_SPIRV_OP_CASE(ddx_fine, DPdxFine, false);
-    INTRINSIC_SPIRV_OP_CASE(ddy, DPdy, true);
-    INTRINSIC_SPIRV_OP_CASE(ddy_coarse, DPdyCoarse, false);
-    INTRINSIC_SPIRV_OP_CASE(ddy_fine, DPdyFine, false);
+  case hlsl::IntrinsicOp::IOP_ddx:
+  case hlsl::IntrinsicOp::IOP_ddx_coarse:
+  case hlsl::IntrinsicOp::IOP_ddx_fine:
+  case hlsl::IntrinsicOp::IOP_ddy:
+  case hlsl::IntrinsicOp::IOP_ddy_coarse:
+  case hlsl::IntrinsicOp::IOP_ddy_fine: {
+    retVal = processDerivativeIntrinsic(hlslOpcode, callExpr->getArg(0),
+                                        callExpr->getExprLoc(),
+                                        callExpr->getSourceRange());
+    break;
+  }
     INTRINSIC_SPIRV_OP_CASE(countbits, BitCount, false);
     INTRINSIC_SPIRV_OP_CASE(fmod, FRem, true);
     INTRINSIC_SPIRV_OP_CASE(fwidth, Fwidth, true);
@@ -9572,6 +9577,77 @@ SpirvEmitter::processIntrinsicFirstbit(const CallExpr *callExpr,
                                        srcRange);
 }
 
+SpirvInstruction *SpirvEmitter::processMatrixDerivativeIntrinsic(
+    hlsl::IntrinsicOp hlslOpcode, const Expr *arg, SourceLocation loc,
+    SourceRange range) {
+  const auto actOnEachVec = [this, hlslOpcode, loc, range](
+                                uint32_t /*index*/, QualType inType,
+                                QualType outType, SpirvInstruction *curRow) {
+    return processDerivativeIntrinsic(hlslOpcode, curRow, loc, range);
+  };
+
+  return processEachVectorInMatrix(arg, arg->getType(), doExpr(arg),
+                                   actOnEachVec, loc, range);
+}
+
+SpirvInstruction *
+SpirvEmitter::processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode,
+                                         const Expr *arg, SourceLocation loc,
+                                         SourceRange range) {
+  if (isMxNMatrix(arg->getType())) {
+    return processMatrixDerivativeIntrinsic(hlslOpcode, arg, loc, range);
+  }
+  return processDerivativeIntrinsic(hlslOpcode, doExpr(arg), loc, range);
+}
+
+SpirvInstruction *SpirvEmitter::processDerivativeIntrinsic(
+    hlsl::IntrinsicOp hlslOpcode, SpirvInstruction *arg, SourceLocation loc,
+    SourceRange range) {
+  QualType returnType = arg->getAstResultType();
+  assert(isFloatOrVecOfFloatType(returnType));
+
+  if (!spvContext.isPS())
+    addDerivativeGroupExecutionMode();
+  needsLegalization = true;
+
+  QualType B32Type = astContext.FloatTy;
+  uint32_t vectorSize = 0;
+  QualType elementType = returnType;
+  if (isVectorType(returnType, &elementType, &vectorSize)) {
+    B32Type = astContext.getExtVectorType(B32Type, vectorSize);
+  }
+
+  // Derivative operations work on 32-bit floats only. Cast to 32-bit if needed.
+  SpirvInstruction *operand = castToType(arg, returnType, B32Type, loc, range);
+
+  spv::Op opcode = spv::Op::OpNop;
+  switch (hlslOpcode) {
+  case hlsl::IntrinsicOp::IOP_ddx:
+    opcode = spv::Op::OpDPdx;
+    break;
+  case hlsl::IntrinsicOp::IOP_ddx_coarse:
+    opcode = spv::Op::OpDPdxCoarse;
+    break;
+  case hlsl::IntrinsicOp::IOP_ddx_fine:
+    opcode = spv::Op::OpDPdxFine;
+    break;
+  case hlsl::IntrinsicOp::IOP_ddy:
+    opcode = spv::Op::OpDPdy;
+    break;
+  case hlsl::IntrinsicOp::IOP_ddy_coarse:
+    opcode = spv::Op::OpDPdyCoarse;
+    break;
+  case hlsl::IntrinsicOp::IOP_ddy_fine:
+    opcode = spv::Op::OpDPdyFine;
+    break;
+  };
+
+  SpirvInstruction *result =
+      spvBuilder.createUnaryOp(opcode, B32Type, operand, loc, range);
+  result = castToType(result, B32Type, returnType, loc, range);
+  return result;
+}
+
 // Returns true is the given expression can be used as an output parameter.
 //
 // Warning: this function could return false negatives.
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 6c1e12989c..10694313a8 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -789,6 +789,21 @@ class SpirvEmitter : public ASTConsumer {
   SpirvInstruction *processIntrinsicFirstbit(const CallExpr *,
                                              GLSLstd450 glslOpcode);
 
+  SpirvInstruction *
+  processMatrixDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode,
+                                   const Expr *arg, SourceLocation loc,
+                                   SourceRange range);
+
+  SpirvInstruction *processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode,
+                                               const Expr *arg,
+                                               SourceLocation loc,
+                                               SourceRange range);
+
+  SpirvInstruction *processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode,
+                                               SpirvInstruction *arg,
+                                               SourceLocation loc,
+                                               SourceRange range);
+
 private:
   /// Returns the <result-id> for constant value 0 of the given type.
   SpirvConstant *getValueZero(QualType type);
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl
new file mode 100644
index 0000000000..a306463466
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl
@@ -0,0 +1,21 @@
+// RUN: %dxc -T ps_6_2 -E main -fcgl  %s -spirv 2>&1 | FileCheck %s
+
+// CHECK: :14:22: warning: conversion from larger type 'double' to smaller type 'float', possible loss of data [-Wconversion]
+// CHECK: :20:22: warning: conversion from larger type 'double2' to smaller type 'vector<float, 2>', possible loss of data [-Wconversion]
+
+void main() {
+  double    a;
+  double2   b;
+
+// CHECK:      [[a:%[0-9]+]] = OpLoad %double %a
+// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %float [[a]]
+// CHECK-NEXT:   [[r:%[0-9]+]] = OpDPdx %float [[c]]
+// CHECK-NEXT:  OpFConvert %double [[r]]
+  double    da = ddx(a);
+
+// CHECK:      [[b:%[0-9]+]] = OpLoad %v2double %b
+// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %v2float [[b]]
+// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %v2float [[c]]
+// CHECK-NEXT:  OpFConvert %v2double [[r]]
+  double2   db = ddx(b);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl
new file mode 100644
index 0000000000..11b63151ee
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -T ps_6_2 -E main -enable-16bit-types -fcgl  %s -spirv | FileCheck %s
+
+void main() {
+
+  half    a;
+  half2   b;
+
+// CHECK:      [[a:%[0-9]+]] = OpLoad %half %a
+// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %float [[a]]
+// CHECK-NEXT:   [[r:%[0-9]+]] = OpDPdx %float [[c]]
+// CHECK-NEXT:  OpFConvert %half [[r]]
+  half    da = ddx(a);
+
+// CHECK:      [[b:%[0-9]+]] = OpLoad %v2half %b
+// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %v2float [[b]]
+// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %v2float [[c]]
+// CHECK-NEXT:  OpFConvert %v2half [[r]]
+  half2   db = ddx(b);
+}

From 14e1f83cd1c437cd74804d4a99861c4961ded646 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 15 May 2025 14:38:47 -0400
Subject: [PATCH 37/93] [SPIRV] Allow decoration attributes on fields. (#7453)

Fixes #7270
---
 tools/clang/include/clang/Basic/Attr.td        |  6 ++++--
 .../spv.inline.decorate.member.hlsl            | 18 +++++++++++++-----
 .../attributes/spv.inline.decorate.member.hlsl | 13 -------------
 3 files changed, 17 insertions(+), 20 deletions(-)
 delete mode 100644 tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl

diff --git a/tools/clang/include/clang/Basic/Attr.td b/tools/clang/include/clang/Basic/Attr.td
index 2518423565..db7fdea8d9 100644
--- a/tools/clang/include/clang/Basic/Attr.td
+++ b/tools/clang/include/clang/Basic/Attr.td
@@ -1418,7 +1418,8 @@ def VKDecorateExt : InheritableAttr {
 
 def VKDecorateIdExt : InheritableAttr {
   let Spellings = [CXX11<"vk", "ext_decorate_id">];
-  let Subjects = SubjectList<[Function, Var, ParmVar, TypedefName], ErrorDiag>;
+  let Subjects =
+      SubjectList<[Function, Var, ParmVar, Field, TypedefName], ErrorDiag>;
   let Args = [UnsignedArgument<"decorate">, VariadicExprArgument<"arguments">];
   let LangOpts = [SPIRV];
   let Documentation = [Undocumented];
@@ -1426,7 +1427,8 @@ def VKDecorateIdExt : InheritableAttr {
 
 def VKDecorateStringExt : InheritableAttr {
   let Spellings = [CXX11<"vk", "ext_decorate_string">];
-  let Subjects = SubjectList<[Function, Var, ParmVar, TypedefName], ErrorDiag>;
+  let Subjects =
+      SubjectList<[Function, Var, ParmVar, Field, TypedefName], ErrorDiag>;
   let Args = [UnsignedArgument<"decorate">, VariadicStringArgument<"arguments">];
   let LangOpts = [SPIRV];
   let Documentation = [Undocumented];
diff --git a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl
index bb4c2efde1..88a902d326 100644
--- a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl
@@ -4,9 +4,9 @@ template<class T, class U>
 [[vk::ext_instruction(/*spv::OpBitcast*/124)]]
 T Bitcast(U);
 
-// CHECK: OpMemberDecorate %S 0 Offset 0
-// CHECK: OpMemberDecorate %S 1 Offset 16
-// CHECK: %S = OpTypeStruct %v4float %v4float
+// CHECK-DAG: OpMemberDecorate %S 0 Offset 0
+// CHECK-DAG: OpMemberDecorate %S 1 Offset 16
+// CHECK-DAG: %S = OpTypeStruct %v4float %v4float
 
 struct S
 {
@@ -14,6 +14,12 @@ struct S
     [[vk::ext_decorate(/*offset*/ 35, 16)]] float4 f2;
 };
 
+// CHECK-DAG: OpDecorateString %out_var_SV_TARGET UserSemantic "raster_order_group_0"
+struct PixelOutput
+{
+	[[vk::location(0), vk::ext_decorate_string(5635, "raster_order_group_0")]] float4 rt0 : SV_TARGET;
+};
+
 using PointerType = vk::SpirvOpaqueType<
     /* OpTypePointer */ 32,
     /* PhysicalStorageBuffer */ vk::Literal<vk::integral_constant<uint,5349> >,
@@ -27,14 +33,16 @@ S Load(PointerType pointer,
 
 uint64_t address;
 
-float4 main() : SV_TARGET
+PixelOutput main()
 {
 
 // CHECK: [[BC:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_S {{%[0-9]+}}
   PointerType ptr = Bitcast<PointerType>(address);
 
+PixelOutput output;
 // CHECK: [[LD:%[0-9]+]] = OpLoad %S [[BC]] Aligned 32
 // CHECK: [[RET:%[0-9]+]] = OpCompositeExtract %v4float [[LD]] 0
 // CHECK: OpStore %out_var_SV_TARGET [[RET]]
-  return Load(ptr).f1;
+output.rt0 = Load(ptr).f1;
+  return output;
 }
diff --git a/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl b/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl
deleted file mode 100644
index ece7e3f2f4..0000000000
--- a/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl
+++ /dev/null
@@ -1,13 +0,0 @@
-// REQUIRES: spirv
-// RUN: %dxc -T ps_6_0 -E main -verify -spirv %s
-
-struct S
-{
-    [[vk::ext_decorate_id(/*offset*/ 35, 0)]] float4 f1; /* expected-error{{'ext_decorate_id' attribute only applies to functions, variables, parameters, and types}} */
-    [[vk::ext_decorate_string(/*offset*/ 35, "16")]] float4 f2; /* expected-error{{'ext_decorate_string' attribute only applies to functions, variables, parameters, and types}} */
-};
-
-float4 main() : SV_TARGET
-{
-
-}

From 2a6bacd8712b040c5ab490c80dfa7553a355d2e0 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 15 May 2025 14:39:03 -0400
Subject: [PATCH 38/93] [SPIRV] Treat vk::Spirv*Type as opaque when
 reconstructing (#7454)

It is possible to have two struct types in spir-v that are the same
except for the decorations. Sometimes we have to reconstruct the value
from one type to another.

In the case of a vk::SpirvType, we do not know anything about the type,
so this should not happen. When trying to reconstruct the value, we
should simply return the original value.

Fixes #6963
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp             | 11 +++++++----
 .../CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl   | 14 ++++++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 4da8584eee..dc2b332d31 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -7081,14 +7081,17 @@ SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
 
   // Structs
   if (const auto *recordType = valType->getAs<RecordType>()) {
-    assert(recordType->isStructureType());
-
     if (isTypeInVkNamespace(recordType) &&
-        recordType->getDecl()->getName().equals("BufferPointer")) {
-      // Uniquely among structs, vk::BufferPointer<T> lowers to a pointer type.
+        (recordType->getDecl()->getName().equals("BufferPointer") ||
+         recordType->getDecl()->getName().equals("SpirvType") ||
+         recordType->getDecl()->getName().equals("SpirvOpaqueType"))) {
+      // vk::BufferPointer<T> lowers to a pointer type. No need to reconstruct
+      // the value. The vk::Spirv*Type should be treated as an opaque type.
+      // All we can do is leave it the same.
       return srcVal;
     }
 
+    assert(recordType->isStructureType());
     LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
                                       spvBuilder);
     const StructType *spirvStructType =
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl
index 7be0713e48..c2892cfc29 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl
@@ -12,7 +12,16 @@ struct BufferData {
   float3 v;
 };
 
+using MyInt = vk::SpirvType<
+    /*spv::OpTypeInt*/21,
+    1,1, // size and alignment
+    vk::Literal<vk::integral_constant<uint,16> >, // bits
+    vk::Literal<vk::integral_constant<uint,1> > // signed
+>;
+
 uint64_t Address;
+
+[[vk::ext_capability(/* Int16 */ 22)]]
 float4 main() : SV_Target0 {
   // CHECK:      [[addr:%[0-9]+]] = OpLoad %ulong
   // CHECK-NEXT: [[buf:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_float [[addr]]
@@ -50,5 +59,10 @@ float4 main() : SV_Target0 {
   // CHECK-NEXT: [[load:%[0-9]+]] = OpLoad %BufferData_0 [[buf]] Aligned 4
   d = vk::RawBufferLoad<BufferData>(0);
 
+  // CHECK: [[buf:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_spirvIntrinsicType %ulong_0
+  // CHECK-NEXT: [[load:%[0-9]+]] = OpLoad %spirvIntrinsicType [[buf]] Aligned 4
+  // CHECK-NEXT: OpStore %mi [[load]]
+  MyInt mi = vk::RawBufferLoad<MyInt>(0);
+
   return float4(w.x, x, y, z);
 }

From 242196438a1fad25da901b6a095b33929ca19ee3 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 15 May 2025 17:52:56 -0400
Subject: [PATCH 39/93] [SPIRV] Add warning for initialized globals (#7448)

To be consistent with DXIL, we will start emitting a warning for
externally visible variables that have an initializer. Until now, these
initializers were silently ignored.

Fixes #3950
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        |  4 ++++
 .../groupshared.init.warning.hlsl             | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index dc2b332d31..575597352d 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -2021,6 +2021,10 @@ void SpirvEmitter::doVarDecl(const VarDecl *decl) {
   // variables) belongs to the Function storage class.
   if (isExternalVar(decl)) {
     var = declIdMapper.createExternVar(decl);
+    if (decl->hasInit()) {
+      emitWarning("Initializer of external global will be ignored",
+                  decl->getLocation());
+    }
   } else {
     // We already know the variable is not externally visible here. If it does
     // not have local storage, it should be file scope variable.
diff --git a/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl b/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl
new file mode 100644
index 0000000000..c49534948b
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -T cs_6_0 -E main -spirv %s 2>&1 | FileCheck %s
+
+groupshared uint testing = 0;
+
+[numthreads(64, 1, 1)]
+void main(uint local_thread_id_flat : SV_GroupIndex) {
+    
+    InterlockedAdd(testing, 1);
+    GroupMemoryBarrierWithGroupSync();
+    
+    if (local_thread_id_flat == 0) {
+        if (testing > 64) {
+            printf("testing is %u wtf", testing);
+        }
+    }
+}
+
+// CHECK: warning: Initializer of external global will be ignored
+// CHECK-NEXT: groupshared uint testing = 0;
\ No newline at end of file

From 7054e5207ae9cb573e02068aac29ddf6299d2c2e Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 16 May 2025 06:11:59 -0400
Subject: [PATCH 40/93] [SEMA] Don't emit an error for sizeof an enum. (#7449)

Fixes #7416
---
 tools/clang/lib/AST/HlslTypes.cpp             |  2 ++
 .../clang/test/CodeGenSPIRV/enum_sizeof.hlsl  | 31 +++++++++++++++++++
 tools/clang/test/SemaHLSL/enum_sizeof.hlsl    | 31 +++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/enum_sizeof.hlsl

diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index 07efb53c8c..e081362ebf 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -95,6 +95,8 @@ bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type) {
   } else if (type->isArrayType()) {
     return IsHLSLNumericOrAggregateOfNumericType(
         QualType(type->getArrayElementTypeNoTypeQual(), 0));
+  } else if (type->isEnumeralType()) {
+    return true;
   }
 
   // Chars can only appear as part of strings, which we don't consider numeric.
diff --git a/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl b/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl
new file mode 100644
index 0000000000..f596a2db50
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc -T cs_6_0 -E main  -fcgl %s -spirv | FileCheck %s
+
+enum E1 : uint64_t
+{
+    v1 = 0,
+};
+
+enum E2 : uint32_t
+{
+    v2 = 0,
+};
+
+struct S {
+  E1 e1;
+  E2 e2;
+};
+
+RWBuffer<int> b;
+
+[numthreads(128, 1, 1)]
+void main()
+{
+// CHECK: OpImageWrite {{%.*}} %uint_0 %int_8 None
+    b[0] = sizeof(E1);
+
+// CHECK: OpImageWrite {{%.*}} %uint_1 %int_4 None
+    b[1] = sizeof(E2);
+
+// CHECK: OpImageWrite {{%.*}} %uint_2 %int_16 None
+    b[2] = sizeof(S);
+}
diff --git a/tools/clang/test/SemaHLSL/enum_sizeof.hlsl b/tools/clang/test/SemaHLSL/enum_sizeof.hlsl
new file mode 100644
index 0000000000..71723976a9
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/enum_sizeof.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc -T cs_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+
+enum E1 : uint64_t
+{
+    v1 = 0,
+};
+
+enum E2 : uint32_t
+{
+    v2 = 0,
+};
+
+struct S {
+  E1 e1;
+  E2 e2;
+};
+
+RWBuffer<int> b;
+
+[numthreads(128, 1, 1)]
+void main()
+{
+// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'E1'
+    b[0] = sizeof(E1);
+
+// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'E2'
+    b[1] = sizeof(E2);
+
+// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'S'
+    b[2] = sizeof(S);
+}

From fef2f94250793edd2358bc44f5aab8d6251a80e1 Mon Sep 17 00:00:00 2001
From: Nielsbishere <n@osomi.net>
Date: Fri, 16 May 2025 16:10:20 +0200
Subject: [PATCH 41/93] Fix -fvk-invert-y (#7447)

https://github.com/microsoft/DirectXShaderCompiler/issues/7446
This fixes some outdated documentation as well as a compile error when
enabling fvk-invert-y on lib files and makes sure that it only gets
enabled on SV_POSITION that is used in VS/GS/DS/MS (so PS doesn't get
caught in the crossfire).
Also tested the -fvk-use-dx-position-w option; it already has the
correct behavior here.

---------

Co-authored-by: NielsbishereAlt <nb@osomi.net>
---
 docs/SPIR-V.rst                                      |  2 +-
 include/dxc/Support/HLSLOptions.td                   |  2 +-
 tools/clang/lib/SPIRV/SpirvEmitter.cpp               | 12 ++++++++----
 .../test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl  | 12 ++++++++++++
 4 files changed, 22 insertions(+), 6 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl

diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
index b5e9c05079..f3981ba854 100644
--- a/docs/SPIR-V.rst
+++ b/docs/SPIR-V.rst
@@ -4227,7 +4227,7 @@ codegen for Vulkan:
 - ``-fvk-use-dx-layout``: Uses DirectX layout rules for resources.
 - ``-fvk-invert-y``: Negates (additively inverts) SV_Position.y before writing
   to stage output. Used to accommodate the difference between Vulkan's
-  coordinate system and DirectX's. Only allowed in VS/DS/GS.
+  coordinate system and DirectX's. Only allowed in VS/DS/GS/MS/Lib.
 - ``-fvk-use-dx-position-w``: Reciprocates (multiplicatively inverts)
   SV_Position.w after reading from stage input. Used to accommodate the
   difference between Vulkan DirectX: the w component of SV_Position in PS is
diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
index 4d72cb2312..58f6bdfbf3 100644
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@@ -368,7 +368,7 @@ def fvk_bind_register : MultiArg<["-"], "fvk-bind-register", 4>, MetaVarName<"<t
   HelpText<"Specify Vulkan descriptor set and binding for a specific register">;
 def vkbr : MultiArg<["-"], "vkbr", 4>, Flags<[CoreOption, DriverOption]>, Alias<fvk_bind_register>;
 def fvk_invert_y: Flag<["-"], "fvk-invert-y">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
-  HelpText<"Negate SV_Position.y before writing to stage output in VS/DS/GS to accommodate Vulkan's coordinate system">;
+  HelpText<"Negate SV_Position.y before writing to stage output in VS/DS/GS/MS/Lib to accommodate Vulkan's coordinate system">;
 def fvk_use_dx_position_w: Flag<["-"], "fvk-use-dx-position-w">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
   HelpText<"Reciprocate SV_Position.w after reading from stage input in PS to accommodate the difference between Vulkan and DirectX">;
 def fvk_support_nonzero_base_instance: Flag<["-"], "fvk-support-nonzero-base-instance">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 575597352d..9ffa978511 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -604,8 +604,8 @@ SpirvEmitter::SpirvEmitter(CompilerInstance &ci)
     emitError("unknown shader module: %0", {}) << shaderModel->GetName();
 
   if (spirvOptions.invertY && !shaderModel->IsVS() && !shaderModel->IsDS() &&
-      !shaderModel->IsGS() && !shaderModel->IsMS())
-    emitError("-fvk-invert-y can only be used in VS/DS/GS/MS", {});
+      !shaderModel->IsGS() && !shaderModel->IsMS() && !shaderModel->IsLib())
+    emitError("-fvk-invert-y can only be used in VS/DS/GS/MS/Lib", {});
 
   if (spirvOptions.useGlLayout && spirvOptions.useDxLayout)
     emitError("cannot specify both -fvk-use-dx-layout and -fvk-use-gl-layout",
@@ -14964,8 +14964,12 @@ SpirvEmitter::createSpirvIntrInstExt(llvm::ArrayRef<const Attr *> attrs,
 SpirvInstruction *SpirvEmitter::invertYIfRequested(SpirvInstruction *position,
                                                    SourceLocation loc,
                                                    SourceRange range) {
-  // Negate SV_Position.y if requested
-  if (spirvOptions.invertY) {
+  // Negate SV_Position.y if requested and supported
+
+  bool supportsInvertY = spvContext.isVS() || spvContext.isGS() ||
+                         spvContext.isDS() || spvContext.isMS();
+
+  if (spirvOptions.invertY && supportsInvertY) {
     const auto oldY = spvBuilder.createCompositeExtract(
         astContext.FloatTy, position, {1}, loc, range);
     const auto newY = spvBuilder.createUnaryOp(
diff --git a/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl b/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl
new file mode 100644
index 0000000000..6dac20fc6f
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl
@@ -0,0 +1,12 @@
+// RUN: %dxc -T lib_6_3 -fvk-invert-y -fcgl  %s -spirv | FileCheck %s
+
+[shader("vertex")]
+float4 main(float4 a : A) : SV_Position {
+    return a;
+}
+
+// CHECK:         [[a:%[0-9]+]] = OpFunctionCall %v4float %src_main %param_var_a
+// CHECK-NEXT: [[oldY:%[0-9]+]] = OpCompositeExtract %float [[a]] 1
+// CHECK-NEXT: [[newY:%[0-9]+]] = OpFNegate %float [[oldY]]
+// CHECK-NEXT:  [[pos:%[0-9]+]] = OpCompositeInsert %v4float [[newY]] [[a]] 1
+// CHECK-NEXT:                 OpStore %gl_Position [[pos]]

From 053e7ac656e01d90aa9931c4d8b8a89c14741027 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Fri, 16 May 2025 08:42:57 -0700
Subject: [PATCH 42/93] Refactor udt intrinsic arg copy to before SROA, flatten
 RayDesc (#7440)

Intrinsics that take UDT arguments need copy-in/copy-out. Other
aggregate args are flattened for intrinsic calls. Previously, these
operations were intermingled, driven by SROA on alloca/GV values.

There were RayDesc arguments that weren't treated consistently, and
weren't copied in when necessary, leading to problems. They should be
flattened into the intrinsic arguments, but TraceRay calls didn't do
this.

This change:
- flattens RayDesc args for all intrinsics that use them.
- separates the copy-in/copy-out generation into a separate operation
before SROA. Ideally, this copy-in/copy-out would have been generated by
CodeGen based on by-value passing, but that's a deeper intrinsic AST
issue potentially.
- Updated and added tests.

Fixes #7434.
---
 include/dxc/DXIL/DxilConstants.h              |   4 +
 include/dxc/HLSL/HLOperations.h               |  23 +-
 lib/HLSL/HLOperationLower.cpp                 | 200 ++++++-------
 .../Scalar/ScalarReplAggregatesHLSL.cpp       | 266 +++++++++++-------
 .../hlsl/objects/RayQuery/tracerayinline.hlsl |   0
 .../RayQuery/tracerayinline_cb_raydesc.hlsl   |  14 +
 .../DxilGen/hitobject_traceinvoke_dxilgen.ll  | 201 ++++++-------
 .../tracerayinline_cb_raydesc_dxilgen.ll      | 160 +++++++++++
 .../Passes/DxilGen/tracerayinline_dxilgen.ll  | 134 +++++++++
 .../hitobject_fromrayquery_scalarrepl.ll      |   8 +-
 .../hitobject_make_scalarrepl.ll              |  13 +-
 .../hitobject_traceinvoke_scalarrepl.ll       | 198 +++++++++++++
 .../ScalarReplHLSL/traceray_scalarrepl.ll     | 182 ++++++++++++
 .../tracerayinline_cb_raydesc_scalarrepl.ll   | 154 ++++++++++
 .../tracerayinline_scalarrepl.ll              | 155 ++++++++++
 .../pix/AnnotateVirtualRegs-Raygen.hlsl       |  36 ---
 ...raytracing_intersection_geometryIndex.hlsl |   8 +-
 tools/clang/unittests/HLSL/PixTest.cpp        | 100 -------
 18 files changed, 1375 insertions(+), 481 deletions(-)
 rename tools/clang/test/{HLSLFileCheck => CodeGenDXIL}/hlsl/objects/RayQuery/tracerayinline.hlsl (100%)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll
 create mode 100644 tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll
 create mode 100644 tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll
 create mode 100644 tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll
 create mode 100644 tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll
 create mode 100644 tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll
 delete mode 100644 tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl

diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index bf6de7ed3b..0f28edbc39 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -1583,6 +1583,10 @@ const unsigned kMSStoreOutputColOpIdx = 3;
 const unsigned kMSStoreOutputVIdxOpIdx = 4;
 const unsigned kMSStoreOutputValOpIdx = 5;
 
+// HitObject::MakeMiss
+const unsigned kHitObjectMakeMiss_RayDescOpIdx = 3;
+const unsigned kHitObjectMakeMiss_NumOp = 11;
+
 // HitObject::TraceRay
 const unsigned kHitObjectTraceRay_RayDescOpIdx = 7;
 const unsigned kHitObjectTraceRay_PayloadOpIdx = 15;
diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h
index c75318da99..0da9804ecb 100644
--- a/include/dxc/HLSL/HLOperations.h
+++ b/include/dxc/HLSL/HLOperations.h
@@ -396,7 +396,12 @@ const unsigned kAnnotateHandleResourceTypeOpIdx = 3;
 
 // TraceRay.
 const unsigned kTraceRayRayDescOpIdx = 7;
-const unsigned kTraceRayPayLoadOpIdx = 8;
+// kTraceRayPayloadPreOpIdx is before flattening the RayDesc
+const unsigned kTraceRayPayloadPreOpIdx = 8;
+// kTraceRayPayloadOpIdx is after flattening the RayDesc
+const unsigned kTraceRayPayloadOpIdx = 11;
+const unsigned kTraceRay_PreNumOp = 9;
+const unsigned kTraceRay_NumOp = 12;
 
 // AllocateRayQuery
 const unsigned kAllocateRayQueryRayFlagsIdx = 1;
@@ -407,6 +412,10 @@ const unsigned kCallShaderPayloadOpIdx = 2;
 
 // TraceRayInline.
 const unsigned kTraceRayInlineRayDescOpIdx = 5;
+// kTraceRayInlinePayloadPreOpIdx is before flattening the RayDesc
+const unsigned kTraceRayInlinePayloadPreOpIdx = 6;
+// kTraceRayInlinePayloadOpIdx is after flattening the RayDesc
+const unsigned kTraceRayInlinePayloadOpIdx = 9;
 
 // ReportIntersection.
 const unsigned kReportIntersectionAttributeOpIdx = 3;
@@ -435,11 +444,19 @@ const unsigned kAnnotateNodeRecordHandleNodeRecordPropIdx = 2;
 
 // HitObject::MakeMiss
 const unsigned kHitObjectMakeMiss_NumOp = 8;
-const unsigned kHitObjectMakeMissRayDescOpIdx = 4;
+const unsigned kHitObjectMakeMiss_RayDescOpIdx = 4;
 
 // HitObject::TraceRay
 const unsigned kHitObjectTraceRay_RayDescOpIdx = 8;
-const unsigned kHitObjectTraceRay_NumOp = 10;
+// kHitObjectTraceRay_PayloadPreOpIdx is before flattening the RayDesc
+const unsigned kHitObjectTraceRay_PayloadPreOpIdx = 9;
+// kHitObjectTraceRay_PayloadOpIdx is after flattening the RayDesc
+const unsigned kHitObjectTraceRay_PayloadOpIdx = 12;
+const unsigned kHitObjectTraceRay_PreNumOp = 10;
+const unsigned kHitObjectTraceRay_NumOp = 13;
+
+// HitObject::Invoke
+const unsigned kHitObjectInvoke_PayloadOpIdx = 2;
 
 // HitObject::FromRayQuery
 const unsigned kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx = 4;
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 18d003a764..58c1de3941 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -5720,37 +5720,24 @@ Value *TranslateCallShader(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return Builder.CreateCall(F, {opArg, ShaderIndex, Parameter});
 }
 
-static unsigned LoadRayDescElementsIntoArgs(Value **Args, hlsl::OP *OP,
-                                            IRBuilder<> &Builder,
-                                            Value *RayDescPtr, unsigned Index) {
-  // struct RayDesc
-  //{
-  //    float3 Origin;
-  //    float  TMin;
-  //    float3 Direction;
-  //    float  TMax;
-  //};
-  Value *ZeroIdx = OP->GetU32Const(0);
-  Value *Origin = Builder.CreateGEP(RayDescPtr, {ZeroIdx, ZeroIdx});
-  Origin = Builder.CreateLoad(Origin);
-  Args[Index++] = Builder.CreateExtractElement(Origin, (uint64_t)0);
-  Args[Index++] = Builder.CreateExtractElement(Origin, 1);
-  Args[Index++] = Builder.CreateExtractElement(Origin, 2);
-
-  Value *TMinPtr = Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(1)});
-  Args[Index++] = Builder.CreateLoad(TMinPtr);
-
-  Value *DirectionPtr =
-      Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(2)});
-  Value *Direction = Builder.CreateLoad(DirectionPtr);
-
-  Args[Index++] = Builder.CreateExtractElement(Direction, (uint64_t)0);
-  Args[Index++] = Builder.CreateExtractElement(Direction, 1);
-  Args[Index++] = Builder.CreateExtractElement(Direction, 2);
-
-  Value *TMaxPtr = Builder.CreateGEP(RayDescPtr, {ZeroIdx, OP->GetU32Const(3)});
-  Args[Index++] = Builder.CreateLoad(TMaxPtr);
-  return Index;
+static void TransferRayDescArgs(Value **Args, hlsl::OP *OP,
+                                IRBuilder<> &Builder, CallInst *CI,
+                                unsigned &Index, unsigned &HLIndex) {
+  // Extract elements from flattened ray desc arguments in HL op.
+  // float3 Origin;
+  Value *origin = CI->getArgOperand(HLIndex++);
+  Args[Index++] = Builder.CreateExtractElement(origin, (uint64_t)0);
+  Args[Index++] = Builder.CreateExtractElement(origin, 1);
+  Args[Index++] = Builder.CreateExtractElement(origin, 2);
+  // float  TMin;
+  Args[Index++] = CI->getArgOperand(HLIndex++);
+  // float3 Direction;
+  Value *direction = CI->getArgOperand(HLIndex++);
+  Args[Index++] = Builder.CreateExtractElement(direction, (uint64_t)0);
+  Args[Index++] = Builder.CreateExtractElement(direction, 1);
+  Args[Index++] = Builder.CreateExtractElement(direction, 2);
+  // float  TMax;
+  Args[Index++] = CI->getArgOperand(HLIndex++);
 }
 
 Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
@@ -5759,21 +5746,24 @@ Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
                          bool &Translated) {
   hlsl::OP *OP = &Helper.hlslOP;
 
-  Value *RayDesc = CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx);
-  Value *PayLoad = CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx);
-
   Value *Args[DXIL::OperandIndex::kTraceRayNumOp];
   Args[0] = OP->GetU32Const(static_cast<unsigned>(OpCode));
-  for (unsigned i = 1; i < HLOperandIndex::kTraceRayRayDescOpIdx; i++)
-    Args[i] = CI->getArgOperand(i);
+  unsigned Index = 1, HLIndex = 1;
+  while (HLIndex < HLOperandIndex::kTraceRayRayDescOpIdx)
+    Args[Index++] = CI->getArgOperand(HLIndex++);
 
   IRBuilder<> Builder(CI);
-  LoadRayDescElementsIntoArgs(Args, OP, Builder, RayDesc,
-                              DXIL::OperandIndex::kTraceRayRayDescOpIdx);
+  TransferRayDescArgs(Args, OP, Builder, CI, Index, HLIndex);
+  DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands() - 1);
+  DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayPayloadOpIdx);
+
+  Value *Payload = CI->getArgOperand(HLIndex++);
+  Args[Index++] = Payload;
 
-  Args[DXIL::OperandIndex::kTraceRayPayloadOpIdx] = PayLoad;
+  DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands());
+  DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayNumOp);
 
-  Type *Ty = PayLoad->getType();
+  Type *Ty = Payload->getType();
   Function *F = OP->GetOpFunc(OpCode, Ty);
 
   return Builder.CreateCall(F, Args);
@@ -5817,33 +5807,16 @@ Value *TranslateTraceRayInline(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
 
   Value *Args[DXIL::OperandIndex::kTraceRayInlineNumOp];
   Args[0] = opArg;
-  for (unsigned i = 1; i < HLOperandIndex::kTraceRayInlineRayDescOpIdx; i++) {
-    Args[i] = CI->getArgOperand(i);
-  }
+  unsigned Index = 1, HLIndex = 1;
+  while (HLIndex < HLOperandIndex::kTraceRayInlineRayDescOpIdx)
+    Args[Index++] = CI->getArgOperand(HLIndex++);
 
   IRBuilder<> Builder(CI);
-  unsigned hlIndex = HLOperandIndex::kTraceRayInlineRayDescOpIdx;
-  unsigned index = DXIL::OperandIndex::kTraceRayInlineRayDescOpIdx;
-
-  // struct RayDesc
-  //{
-  //    float3 Origin;
-  Value *origin = CI->getArgOperand(hlIndex++);
-  Args[index++] = Builder.CreateExtractElement(origin, (uint64_t)0);
-  Args[index++] = Builder.CreateExtractElement(origin, 1);
-  Args[index++] = Builder.CreateExtractElement(origin, 2);
-  //    float  TMin;
-  Args[index++] = CI->getArgOperand(hlIndex++);
-  //    float3 Direction;
-  Value *direction = CI->getArgOperand(hlIndex++);
-  Args[index++] = Builder.CreateExtractElement(direction, (uint64_t)0);
-  Args[index++] = Builder.CreateExtractElement(direction, 1);
-  Args[index++] = Builder.CreateExtractElement(direction, 2);
-  //    float  TMax;
-  Args[index++] = CI->getArgOperand(hlIndex++);
-  //};
-
-  DXASSERT_NOMSG(index == DXIL::OperandIndex::kTraceRayInlineNumOp);
+  DXASSERT_NOMSG(HLIndex == HLOperandIndex::kTraceRayInlineRayDescOpIdx);
+  DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayInlineRayDescOpIdx);
+  TransferRayDescArgs(Args, hlslOP, Builder, CI, Index, HLIndex);
+  DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands());
+  DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayInlineNumOp);
 
   Function *F = hlslOP->GetOpFunc(opcode, Builder.getVoidTy());
 
@@ -6197,55 +6170,49 @@ Value *TranslateUnpack(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
 
 // Shader Execution Reordering.
 namespace {
-Value *TranslateHitObjectMake(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode,
-                              HLOperationLowerHelper &Helper,
-                              HLObjectOperationLowerHelper *ObjHelper,
-                              bool &Translated) {
+Value *TranslateHitObjectMakeNop(CallInst *CI, IntrinsicOp IOP,
+                                 OP::OpCode Opcode,
+                                 HLOperationLowerHelper &Helper,
+                                 HLObjectOperationLowerHelper *ObjHelper,
+                                 bool &Translated) {
   hlsl::OP *HlslOP = &Helper.hlslOP;
   IRBuilder<> Builder(CI);
-  unsigned SrcIdx = 1;
-  Value *HitObjectPtr = CI->getArgOperand(SrcIdx++);
-  if (Opcode == OP::OpCode::HitObject_MakeNop) {
-    Value *HitObject = TrivialDxilOperation(
-        Opcode, {nullptr}, Type::getVoidTy(CI->getContext()), CI, HlslOP);
-    Builder.CreateStore(HitObject, HitObjectPtr);
-    DXASSERT(
-        CI->use_empty(),
-        "Default ctor return type is a Clang artifact. Value must not be used");
-    return nullptr;
-  }
+  Value *HitObjectPtr = CI->getArgOperand(1);
+  Value *HitObject = TrivialDxilOperation(
+      Opcode, {nullptr}, Type::getVoidTy(CI->getContext()), CI, HlslOP);
+  Builder.CreateStore(HitObject, HitObjectPtr);
+  DXASSERT(
+      CI->use_empty(),
+      "Default ctor return type is a Clang artifact. Value must not be used");
+  return nullptr;
+}
 
+Value *TranslateHitObjectMakeMiss(CallInst *CI, IntrinsicOp IOP,
+                                  OP::OpCode Opcode,
+                                  HLOperationLowerHelper &Helper,
+                                  HLObjectOperationLowerHelper *ObjHelper,
+                                  bool &Translated) {
   DXASSERT_NOMSG(CI->getNumArgOperands() ==
                  HLOperandIndex::kHitObjectMakeMiss_NumOp);
-  Value *RayFlags = CI->getArgOperand(SrcIdx++);
-  Value *MissShaderIdx = CI->getArgOperand(SrcIdx++);
-  DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectMakeMissRayDescOpIdx);
-  Value *RayDescOrigin = CI->getArgOperand(SrcIdx++);
-  Value *RayDescOriginX =
-      Builder.CreateExtractElement(RayDescOrigin, (uint64_t)0);
-  Value *RayDescOriginY =
-      Builder.CreateExtractElement(RayDescOrigin, (uint64_t)1);
-  Value *RayDescOriginZ =
-      Builder.CreateExtractElement(RayDescOrigin, (uint64_t)2);
-
-  Value *RayDescTMin = CI->getArgOperand(SrcIdx++);
-  Value *RayDescDirection = CI->getArgOperand(SrcIdx++);
-  Value *RayDescDirectionX =
-      Builder.CreateExtractElement(RayDescDirection, (uint64_t)0);
-  Value *RayDescDirectionY =
-      Builder.CreateExtractElement(RayDescDirection, (uint64_t)1);
-  Value *RayDescDirectionZ =
-      Builder.CreateExtractElement(RayDescDirection, (uint64_t)2);
-
-  Value *RayDescTMax = CI->getArgOperand(SrcIdx++);
+  hlsl::OP *OP = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+  Value *Args[DXIL::OperandIndex::kHitObjectMakeMiss_NumOp];
+  Args[0] = nullptr; // Filled in by TrivialDxilOperation
+
+  unsigned DestIdx = 1, SrcIdx = 1;
+  Value *HitObjectPtr = CI->getArgOperand(SrcIdx++);
+  Args[DestIdx++] = CI->getArgOperand(SrcIdx++); // RayFlags
+  Args[DestIdx++] = CI->getArgOperand(SrcIdx++); // MissShaderIdx
+
+  DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx);
+  DXASSERT_NOMSG(DestIdx ==
+                 DXIL::OperandIndex::kHitObjectMakeMiss_RayDescOpIdx);
+  TransferRayDescArgs(Args, OP, Builder, CI, DestIdx, SrcIdx);
   DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
+  DXASSERT_NOMSG(DestIdx == DXIL::OperandIndex::kHitObjectMakeMiss_NumOp);
 
-  Value *OutHitObject = TrivialDxilOperation(
-      Opcode,
-      {nullptr, RayFlags, MissShaderIdx, RayDescOriginX, RayDescOriginY,
-       RayDescOriginZ, RayDescTMin, RayDescDirectionX, RayDescDirectionY,
-       RayDescDirectionZ, RayDescTMax},
-      Helper.voidTy, CI, HlslOP);
+  Value *OutHitObject =
+      TrivialDxilOperation(Opcode, Args, Helper.voidTy, CI, OP);
   Builder.CreateStore(OutHitObject, HitObjectPtr);
   return nullptr;
 }
@@ -6348,10 +6315,9 @@ Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP,
   hlsl::OP *OP = &Helper.hlslOP;
   IRBuilder<> Builder(CI);
 
-  const unsigned DxilNumArgs = DxilInst_HitObject_TraceRay::arg_payload + 1;
   DXASSERT_NOMSG(CI->getNumArgOperands() ==
                  HLOperandIndex::kHitObjectTraceRay_NumOp);
-  Value *Args[DxilNumArgs];
+  Value *Args[DXIL::OperandIndex::kHitObjectTraceRay_NumOp];
   Value *OpArg = OP->GetU32Const(static_cast<unsigned>(OpCode));
   Args[0] = OpArg;
 
@@ -6363,13 +6329,19 @@ Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP,
     Args[DestIdx] = CI->getArgOperand(SrcIdx);
   }
 
-  Value *RayDescPtr = CI->getArgOperand(SrcIdx++);
-  DestIdx = LoadRayDescElementsIntoArgs(Args, OP, Builder, RayDescPtr, DestIdx);
+  DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx);
+  DXASSERT_NOMSG(DestIdx ==
+                 DXIL::OperandIndex::kHitObjectTraceRay_RayDescOpIdx);
+  TransferRayDescArgs(Args, OP, Builder, CI, DestIdx, SrcIdx);
+  DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands() - 1);
+  DXASSERT_NOMSG(DestIdx ==
+                 DXIL::OperandIndex::kHitObjectTraceRay_PayloadOpIdx);
+
   Value *Payload = CI->getArgOperand(SrcIdx++);
   Args[DestIdx++] = Payload;
 
   DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands());
-  DXASSERT_NOMSG(DestIdx == DxilNumArgs);
+  DXASSERT_NOMSG(DestIdx == DXIL::OperandIndex::kHitObjectTraceRay_NumOp);
 
   Function *F = OP->GetOpFunc(OpCode, Payload->getType());
 
@@ -7402,7 +7374,7 @@ IntrinsicLower gLowerTable[] = {
      DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedUMin, TranslateMopAtomicBinaryOperation,
      DXIL::OpCode::NumOpCodes},
-    {IntrinsicOp::MOP_DxHitObject_MakeNop, TranslateHitObjectMake,
+    {IntrinsicOp::MOP_DxHitObject_MakeNop, TranslateHitObjectMakeNop,
      DXIL::OpCode::HitObject_MakeNop},
     {IntrinsicOp::IOP_DxMaybeReorderThread, TranslateMaybeReorderThread,
      DXIL::OpCode::MaybeReorderThread},
@@ -7462,7 +7434,7 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::MOP_DxHitObject_LoadLocalRootTableConstant,
      TranslateHitObjectLoadLocalRootTableConstant,
      DXIL::OpCode::HitObject_LoadLocalRootTableConstant},
-    {IntrinsicOp::MOP_DxHitObject_MakeMiss, TranslateHitObjectMake,
+    {IntrinsicOp::MOP_DxHitObject_MakeMiss, TranslateHitObjectMakeMiss,
      DXIL::OpCode::HitObject_MakeMiss},
     {IntrinsicOp::MOP_DxHitObject_SetShaderTableIndex,
      TranslateHitObjectSetShaderTableIndex,
diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index 8bd78dd9a6..d8746862bc 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -129,7 +129,6 @@ class SROA_Helper {
   void RewriteMemIntrin(MemIntrinsic *MI, Value *OldV);
   void RewriteCall(CallInst *CI);
   void RewriteBitCast(BitCastInst *BCI);
-  void RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn, bool bOut);
 };
 
 } // namespace
@@ -1478,6 +1477,53 @@ void isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
   }
 }
 
+// Returns whether the `OpIdx` argument of HL intrinsic call `CI` is expected to
+// be a user-defined-type.
+static bool isUDTIntrinsicArg(CallInst *CI, unsigned OpIdx) {
+  if (HLOpcodeGroup::HLIntrinsic != GetHLOpcodeGroup(CI->getCalledFunction()))
+    return false;
+  const unsigned NumOps = CI->getNumArgOperands();
+  switch (static_cast<IntrinsicOp>(GetHLOpcode(CI))) {
+  case IntrinsicOp::IOP_TraceRay:
+    if (NumOps == HLOperandIndex::kTraceRay_PreNumOp &&
+        OpIdx == HLOperandIndex::kTraceRayPayloadPreOpIdx)
+      return true;
+    else if (NumOps == HLOperandIndex::kTraceRay_NumOp &&
+             OpIdx == HLOperandIndex::kTraceRayPayloadOpIdx)
+      return true;
+    break;
+  case IntrinsicOp::IOP_ReportHit:
+    if (OpIdx == HLOperandIndex::kReportIntersectionAttributeOpIdx)
+      return true;
+    break;
+  case IntrinsicOp::IOP_CallShader:
+    if (OpIdx == HLOperandIndex::kCallShaderPayloadOpIdx)
+      return true;
+    break;
+  case IntrinsicOp::MOP_DxHitObject_FromRayQuery:
+    if (NumOps == HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp &&
+        OpIdx ==
+            HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx)
+      return true;
+    break;
+  case IntrinsicOp::MOP_DxHitObject_TraceRay:
+    if (NumOps == HLOperandIndex::kHitObjectTraceRay_PreNumOp &&
+        OpIdx == HLOperandIndex::kHitObjectTraceRay_PayloadPreOpIdx)
+      return true;
+    else if (NumOps == HLOperandIndex::kHitObjectTraceRay_NumOp &&
+             OpIdx == HLOperandIndex::kHitObjectTraceRay_PayloadOpIdx)
+      return true;
+    break;
+  case IntrinsicOp::MOP_DxHitObject_Invoke:
+    if (OpIdx == HLOperandIndex::kHitObjectInvoke_PayloadOpIdx)
+      return true;
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
 /// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
 /// performing scalar replacement of alloca AI.  The results are flagged in
 /// the Info parameter.  Offset indicates the position within AI that is
@@ -1535,18 +1581,9 @@ void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) {
       // Most HL functions are safe for scalar repl.
       if (HLOpcodeGroup::NotHL == group)
         return MarkUnsafe(Info, User);
-      else if (HLOpcodeGroup::HLIntrinsic == group) {
-        // TODO: should we check HL parameter type for UDT overload instead of
-        // basing on IOP?
-        IntrinsicOp opcode = static_cast<IntrinsicOp>(GetHLOpcode(CI));
-        if (IntrinsicOp::IOP_TraceRay == opcode ||
-            IntrinsicOp::MOP_DxHitObject_TraceRay == opcode ||
-            IntrinsicOp::MOP_DxHitObject_Invoke == opcode ||
-            IntrinsicOp::IOP_ReportHit == opcode ||
-            IntrinsicOp::IOP_CallShader == opcode) {
-          return MarkUnsafe(Info, User);
-        }
-      }
+      else if (HLOpcodeGroup::HLIntrinsic == group &&
+               isUDTIntrinsicArg(CI, U.getOperandNo()))
+        return MarkUnsafe(Info, User);
     } else {
       return MarkUnsafe(Info, User);
     }
@@ -2662,12 +2699,11 @@ void SROA_Helper::RewriteBitCast(BitCastInst *BCI) {
   RewriteForGEP(cast<GEPOperator>(GEP), GEPBuilder);
 }
 
-/// RewriteCallArg - For Functions which don't flat,
-///                  replace OldVal with alloca and
-///                  copy in copy out data between alloca and flattened NewElts
-///                  in CallInst.
-void SROA_Helper::RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn,
-                                 bool bOut) {
+/// memcpyAggCallArg - For an aggregate call argument, this replaces the
+/// argument with an alloca and inserts a memcpy for input (if CopyIn) and
+/// output (if CopyOut).
+static void memcpyAggCallArg(CallInst *CI, unsigned ArgIdx, bool CopyIn,
+                             bool CopyOut) {
   Function *F = CI->getParent()->getParent();
   IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F));
   const DataLayout &DL = F->getParent()->getDataLayout();
@@ -2677,17 +2713,79 @@ void SROA_Helper::RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn,
   Type *userTyElt = userTy->getElementType();
   Value *Alloca = AllocaBuilder.CreateAlloca(userTyElt);
   IRBuilder<> Builder(CI);
-  if (bIn) {
-    MemCpyInst *cpy = cast<MemCpyInst>(Builder.CreateMemCpy(
-        Alloca, userTyV, DL.getTypeAllocSize(userTyElt), false));
-    RewriteMemIntrin(cpy, cpy->getRawSource());
-  }
+  if (CopyIn)
+    Builder.CreateMemCpy(Alloca, userTyV, DL.getTypeAllocSize(userTyElt),
+                         false);
   CI->setArgOperand(ArgIdx, Alloca);
-  if (bOut) {
+  if (CopyOut) {
     Builder.SetInsertPoint(CI->getNextNode());
-    MemCpyInst *cpy = cast<MemCpyInst>(Builder.CreateMemCpy(
-        userTyV, Alloca, DL.getTypeAllocSize(userTyElt), false));
-    RewriteMemIntrin(cpy, cpy->getRawSource());
+    Builder.CreateMemCpy(userTyV, Alloca, DL.getTypeAllocSize(userTyElt),
+                         false);
+  }
+}
+
+static void copyIntrinsicAggArgs(HLModule &HLM) {
+  // Iterate HLIntrinsic function users
+  // For specific intrinsics, use memcpyAggCallArg on aggregate args
+  // This ensures that the call does not directly use the pointer supplied,
+  // allowing certain arguments to be flattened, and UDT args to be correctly
+  // lowered.
+  for (Function &F : HLM.GetModule()->functions()) {
+    if (F.isIntrinsic() || !F.isDeclaration())
+      continue;
+    if (GetHLOpcodeGroup(&F) != HLOpcodeGroup::HLIntrinsic)
+      continue;
+    // Iterate users
+    for (User *U : F.users()) {
+      if (CallInst *CI = dyn_cast<CallInst>(U)) {
+        switch (static_cast<IntrinsicOp>(GetHLOpcode(CI))) {
+        case IntrinsicOp::IOP_TraceRay:
+          memcpyAggCallArg(CI, HLOperandIndex::kTraceRayRayDescOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ false);
+          memcpyAggCallArg(CI, HLOperandIndex::kTraceRayPayloadPreOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ true);
+          break;
+        case IntrinsicOp::IOP_ReportHit:
+          memcpyAggCallArg(CI,
+                           HLOperandIndex::kReportIntersectionAttributeOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ false);
+          break;
+        case IntrinsicOp::IOP_CallShader:
+          memcpyAggCallArg(CI, HLOperandIndex::kCallShaderPayloadOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ true);
+          break;
+        case IntrinsicOp::MOP_TraceRayInline:
+          memcpyAggCallArg(CI, HLOperandIndex::kTraceRayInlineRayDescOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ false);
+          break;
+        case IntrinsicOp::MOP_DxHitObject_FromRayQuery:
+          if (CI->getNumArgOperands() ==
+              HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp)
+            memcpyAggCallArg(
+                CI,
+                HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx,
+                /*CopyIn*/ true, /*CopyOut*/ false);
+          break;
+        case IntrinsicOp::MOP_DxHitObject_MakeMiss:
+          memcpyAggCallArg(CI, HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ false);
+          break;
+        case IntrinsicOp::MOP_DxHitObject_TraceRay:
+          memcpyAggCallArg(CI, HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ false);
+          memcpyAggCallArg(CI,
+                           HLOperandIndex::kHitObjectTraceRay_PayloadPreOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ true);
+          break;
+        case IntrinsicOp::MOP_DxHitObject_Invoke:
+          memcpyAggCallArg(CI, HLOperandIndex::kHitObjectInvoke_PayloadOpIdx,
+                           /*CopyIn*/ true, /*CopyOut*/ true);
+          break;
+        default:
+          break;
+        }
+      }
+    }
   }
 }
 
@@ -2741,13 +2839,26 @@ static CallInst *RewriteWithFlattenedHLIntrinsicCall(CallInst *CI,
 
 /// RewriteCall - Replace OldVal with flattened NewElts in CallInst.
 void SROA_Helper::RewriteCall(CallInst *CI) {
-  HLOpcodeGroup group = GetHLOpcodeGroupByName(CI->getCalledFunction());
-  if (group != HLOpcodeGroup::NotHL) {
+  HLOpcodeGroup Group = GetHLOpcodeGroupByName(CI->getCalledFunction());
+  if (Group != HLOpcodeGroup::NotHL) {
     unsigned opcode = GetHLOpcode(CI);
-    if (group == HLOpcodeGroup::HLIntrinsic) {
+    if (Group == HLOpcodeGroup::HLIntrinsic) {
+      // RayQuery this pointer replacement.
+      if (OldVal->getType()->isPointerTy() &&
+          dxilutil::IsHLSLRayQueryType(
+              OldVal->getType()->getPointerElementType())) {
+        // For RayQuery methods, we want to replace the RayQuery this pointer
+        // with a load and use of the underlying handle value.
+        // This will allow elimination of RayQuery types earlier.
+        RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
+                                            /*loadElts*/ true);
+        DeadInsts.push_back(CI);
+        return;
+      }
+
       IntrinsicOp IOP = static_cast<IntrinsicOp>(opcode);
       switch (IOP) {
-      case IntrinsicOp::MOP_Append: {
+      case IntrinsicOp::MOP_Append:
         // Buffer Append already expand in code gen.
         // Must be OutputStream Append here.
         // Every Elt has a pointer type.
@@ -2755,87 +2866,47 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
         RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
                                             /*loadElts*/ false);
         DeadInsts.push_back(CI);
-      } break;
-      case IntrinsicOp::IOP_TraceRay: {
+        return;
+      case IntrinsicOp::IOP_TraceRay:
         if (OldVal ==
             CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx)) {
-          RewriteCallArg(CI, HLOperandIndex::kTraceRayRayDescOpIdx,
-                         /*bIn*/ true, /*bOut*/ false);
-        } else {
-          DXASSERT(OldVal ==
-                       CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx),
-                   "else invalid TraceRay");
-          RewriteCallArg(CI, HLOperandIndex::kTraceRayPayLoadOpIdx,
-                         /*bIn*/ true, /*bOut*/ true);
-        }
-      } break;
-      case IntrinsicOp::IOP_ReportHit: {
-        RewriteCallArg(CI, HLOperandIndex::kReportIntersectionAttributeOpIdx,
-                       /*bIn*/ true, /*bOut*/ false);
-      } break;
-      case IntrinsicOp::IOP_CallShader: {
-        RewriteCallArg(CI, HLOperandIndex::kCallShaderPayloadOpIdx,
-                       /*bIn*/ true, /*bOut*/ true);
-      } break;
-      case IntrinsicOp::MOP_DxHitObject_MakeMiss: {
-        if (OldVal ==
-            CI->getArgOperand(HLOperandIndex::kHitObjectMakeMissRayDescOpIdx)) {
           RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
                                               /*loadElts*/ true);
           DeadInsts.push_back(CI);
+          return;
         }
-      } break;
-      case IntrinsicOp::MOP_TraceRayInline: {
-        if (OldVal ==
-            CI->getArgOperand(HLOperandIndex::kTraceRayInlineRayDescOpIdx)) {
+        break;
+      case IntrinsicOp::MOP_DxHitObject_TraceRay:
+        if (OldVal == CI->getArgOperand(
+                          HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx)) {
           RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
                                               /*loadElts*/ true);
           DeadInsts.push_back(CI);
-          break;
+          return;
         }
-      }
-        LLVM_FALLTHROUGH;
-      case IntrinsicOp::MOP_DxHitObject_FromRayQuery: {
-        const bool IsWithAttrs =
-            CI->getNumArgOperands() ==
-            HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp;
-        if (IsWithAttrs &&
-            (OldVal ==
-             CI->getArgOperand(
-                 HLOperandIndex::
-                     kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx))) {
-          RewriteCallArg(
-              CI,
-              HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx,
-              /*bIn*/ true, /*bOut*/ false);
-          break;
+        break;
+      case IntrinsicOp::MOP_DxHitObject_MakeMiss:
+        if (OldVal == CI->getArgOperand(
+                          HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx)) {
+          RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
+                                              /*loadElts*/ true);
+          DeadInsts.push_back(CI);
+          return;
         }
-
-        // For RayQuery methods, we want to replace the RayQuery this pointer
-        // with a load and use of the underlying handle value.
-        // This will allow elimination of RayQuery types earlier.
-        RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
-                                            /*loadElts*/ true);
-        DeadInsts.push_back(CI);
         break;
-      }
-      default:
-        // RayQuery this pointer replacement.
-        if (OldVal->getType()->isPointerTy() &&
-            CI->getNumArgOperands() >= HLOperandIndex::kHandleOpIdx &&
-            OldVal == CI->getArgOperand(HLOperandIndex::kHandleOpIdx) &&
-            dxilutil::IsHLSLRayQueryType(
-                OldVal->getType()->getPointerElementType())) {
-          // For RayQuery methods, we want to replace the RayQuery this pointer
-          // with a load and use of the underlying handle value.
-          // This will allow elimination of RayQuery types earlier.
+      case IntrinsicOp::MOP_TraceRayInline:
+        if (OldVal ==
+            CI->getArgOperand(HLOperandIndex::kTraceRayInlineRayDescOpIdx)) {
           RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts,
                                               /*loadElts*/ true);
           DeadInsts.push_back(CI);
-          break;
+          return;
         }
-        DXASSERT(0, "cannot flatten hlsl intrinsic.");
+        break;
+      default:
+        break;
       }
+      DXASSERT(0, "cannot flatten hlsl intrinsic.");
     }
     // TODO: check other high level dx operations if need to.
   } else {
@@ -4416,6 +4487,9 @@ class SROA_Parameter_HLSL : public ModulePass {
       F->eraseFromParent();
     }
 
+    // Expand flattened copy-in/copy-out for intrinsic UDT args:
+    copyIntrinsicAggArgs(*m_pHLModule);
+
     // SROA globals and allocas.
     SROAGlobalAndAllocas(*m_pHLModule, m_HasDbgInfo);
 
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/RayQuery/tracerayinline.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl
similarity index 100%
rename from tools/clang/test/HLSLFileCheck/hlsl/objects/RayQuery/tracerayinline.hlsl
rename to tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl
new file mode 100644
index 0000000000..256b6a04e8
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -T vs_6_5 -E main %s | FileCheck %s
+
+// CHECK-DAG: %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false)
+// CHECK-DAG: %[[RQ:[^ ]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513)
+// CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2,
+
+RaytracingAccelerationStructure RTAS;
+
+RayDesc rayDesc;
+
+void main() {
+  RayQuery<RAY_FLAG_FORCE_OPAQUE|RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES> rayQuery;
+  rayQuery.TraceRayInline(RTAS, 1, 2, rayDesc);
+}
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll
index 6f364a0161..03bb0716ce 100644
--- a/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll
@@ -1,26 +1,16 @@
 ; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
 ; REQUIRES: dxil-1-9
 
-;
-; Buffer Definitions:
-;
-;
-; Resource Bindings:
-;
-; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
-; ------------------------------ ---------- ------- ----------- ------- -------------- ------
-; RTAS                              texture     i32         ras      T0t4294967295,space4294967295     1
-;
 target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
 target triple = "dxil-ms-dx"
 
 %struct.RaytracingAccelerationStructure = type { i32 }
-%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
 %struct.Payload = type { <3 x float> }
 %dx.types.HitObject = type { i8* }
 %dx.types.Handle = type { i8* }
 %dx.types.ResourceProperties = type { i32, i32 }
 %"class.RWStructuredBuffer<float>" = type { float }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
 %"class.dx::HitObject" = type { i32 }
 
 @"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
@@ -28,55 +18,37 @@ target triple = "dxil-ms-dx"
 ; Function Attrs: nounwind
 define void @"\01?main@@YAXXZ"() #0 {
 entry:
-  %rayDesc = alloca %struct.RayDesc, align 4
-  %pld = alloca %struct.Payload, align 4
+  %pld_invoke = alloca %struct.Payload
+  %pld_trace = alloca %struct.Payload
   %hit = alloca %dx.types.HitObject, align 4
-  %0 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !31 ; line:80 col:3
-  call void @llvm.lifetime.start(i64 32, i8* %0) #0, !dbg !31 ; line:80 col:3
-  %Origin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 0, !dbg !35 ; line:81 col:11
-  store <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, <3 x float>* %Origin, align 4, !dbg !36, !tbaa !37 ; line:81 col:18
-  %TMin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 1, !dbg !40 ; line:82 col:11
-  store float 3.000000e+00, float* %TMin, align 4, !dbg !41, !tbaa !42 ; line:82 col:16
-  %Direction = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 2, !dbg !44 ; line:83 col:11
-  store <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, <3 x float>* %Direction, align 4, !dbg !45, !tbaa !37 ; line:83 col:21
-  %TMax = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 3, !dbg !46 ; line:84 col:11
-  store float 7.000000e+00, float* %TMax, align 4, !dbg !47, !tbaa !42 ; line:84 col:16
-  %1 = bitcast %struct.Payload* %pld to i8*, !dbg !48 ; line:86 col:3
-  call void @llvm.lifetime.start(i64 12, i8* %1) #0, !dbg !48 ; line:86 col:3
-  %dummy = getelementptr inbounds %struct.Payload, %struct.Payload* %pld, i32 0, i32 0, !dbg !49 ; line:87 col:7
-  store <3 x float> <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>, <3 x float>* %dummy, align 4, !dbg !50, !tbaa !37 ; line:87 col:13
-  %2 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !51 ; line:89 col:3
-  call void @llvm.lifetime.start(i64 4, i8* %2) #0, !dbg !51 ; line:89 col:3
-  %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !52 ; line:89 col:23
-  %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !52 ; line:89 col:23
-  %5 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !52 ; line:89 col:23
-  ; CHECK: %[[ORIGINPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR:[^ ]+]], i32 0, i32 0
-  ; CHECK: %[[ORIGIN:[^ ]+]] = load <3 x float>, <3 x float>* %[[ORIGINPTR]]
-  ; CHECK: %[[O0:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 0
-  ; CHECK: %[[O1:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 1
-  ; CHECK: %[[O2:[^ ]+]] = extractelement <3 x float> %[[ORIGIN]], i64 2
-  ; CHECK: %[[TMINPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 1
-  ; CHECK: %[[TMIN:[^ ]+]] = load float, float* %[[TMINPTR]]
-  ; CHECK: %[[DIRPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 2
-  ; CHECK: %[[DIR:[^ ]+]] = load <3 x float>, <3 x float>* %[[DIRPTR]]
-  ; CHECK: %[[D0:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 0
-  ; CHECK: %[[D1:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 1
-  ; CHECK: %[[D2:[^ ]+]] = extractelement <3 x float> %[[DIR]], i64 2
-  ; CHECK: %[[TMAXPTR:[^ ]+]] = getelementptr %struct.RayDesc, %struct.RayDesc* %[[RAYDESCPTR]], i32 0, i32 3
-  ; CHECK: %[[TMAX:[^ ]+]] = load float, float* %[[TMAXPTR]]
-  ; CHECK: %[[TRACEHO:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, float %[[O0]], float %[[O1]], float %[[O2]], float %[[TMIN]], float %[[D0]], float %[[D1]], float %[[D2]], float %[[TMAX]], %struct.Payload* %pld)
-  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %rayDesc, %struct.Payload* %pld), !dbg !52 ; line:89 col:23
-  ; CHECK: store %dx.types.HitObject %[[TRACEHO]], %dx.types.HitObject* %[[HOPTR:[^ ]+]]
-  ; CHECK: %[[INVOKEHO:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOPTR]]
-  ; CHECK: call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[INVOKEHO]], %struct.Payload* %pld)
-  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld), !dbg !53 ; line:99 col:3
-  %6 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !54 ; line:100 col:1
-  call void @llvm.lifetime.end(i64 4, i8* %6) #0, !dbg !54 ; line:100 col:1
-  %7 = bitcast %struct.Payload* %pld to i8*, !dbg !54 ; line:100 col:1
-  call void @llvm.lifetime.end(i64 12, i8* %7) #0, !dbg !54 ; line:100 col:1
-  %8 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !54 ; line:100 col:1
-  call void @llvm.lifetime.end(i64 32, i8* %8) #0, !dbg !54 ; line:100 col:1
-  ret void, !dbg !54 ; line:100 col:1
+  %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !32 ; line:91 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !32 ; line:91 col:3
+  %1 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !36 ; line:91 col:23
+  %rtas = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %1), !dbg !36 ; line:91 col:23
+
+  ; Capture the handle for the RTAS
+  ; CHECK: %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 })
+  %2 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %rtas, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !36 ; line:91 col:23
+
+  %3 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_trace, i32 0, i32 0, !dbg !36 ; line:91 col:23
+  store <3 x float> <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>, <3 x float>* %3, !dbg !36 ; line:91 col:23
+
+  ; CHECK: %[[TRACEHO:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* %pld_trace), !dbg !3 ; line:91 col:23
+  ; CHECK: store %dx.types.HitObject %[[TRACEHO]], %dx.types.HitObject* %hit
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %2, i32 513, i32 1, i32 2, i32 4, i32 0, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, float 3.000000e+00, <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, float 7.000000e+00, %struct.Payload* %pld_trace), !dbg !36 ; line:91 col:23
+
+  %4 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_trace, i32 0, i32 0, !dbg !37 ; line:101 col:3
+  %5 = load <3 x float>, <3 x float>* %4, !dbg !37 ; line:101 col:3
+  %6 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_invoke, i32 0, i32 0, !dbg !37 ; line:101 col:3
+  store <3 x float> %5, <3 x float>* %6, !dbg !37 ; line:101 col:3
+
+  ; CHECK: %[[INVOKEHO:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %hit
+  ; CHECK: call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[INVOKEHO]], %struct.Payload* %pld_invoke)
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld_invoke), !dbg !37 ; line:101 col:3
+
+  %7 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !38 ; line:102 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %7) #0, !dbg !38 ; line:102 col:1
+  ret void, !dbg !38 ; line:102 col:1
 }
 
 ; Function Attrs: nounwind
@@ -85,9 +57,6 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) #0
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.end(i64, i8* nocapture) #0
 
-; Function Attrs: nounwind
-declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0
-
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
 
@@ -97,71 +66,59 @@ declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.type
 ; Function Attrs: nounwind
 declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32, %dx.types.HitObject*, %struct.Payload*) #0
 
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*) #0
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
 !llvm.module.flags = !{!0}
 !pauseresume = !{!1}
-!dx.version = !{!2}
-!dx.valver = !{!2}
-!dx.shaderModel = !{!3}
-!dx.typeAnnotations = !{!4, !19}
-!dx.entryPoints = !{!23}
-!dx.fnprops = !{!28}
-!dx.options = !{!29, !30}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!3}
+!dx.shaderModel = !{!4}
+!dx.typeAnnotations = !{!5, !20}
+!dx.entryPoints = !{!24}
+!dx.fnprops = !{!29}
+!dx.options = !{!30, !31}
 
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
-!2 = !{i32 1, i32 9}
-!3 = !{!"lib", i32 6, i32 9}
-!4 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !5, %struct.RayDesc undef, !10, %struct.Payload undef, !15, %"class.dx::HitObject" undef, !17}
-!5 = !{i32 4, !6, !7}
-!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
-!7 = !{i32 0, !8}
-!8 = !{!9}
-!9 = !{i32 0, float undef}
-!10 = !{i32 32, !11, !12, !13, !14}
-!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
-!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
-!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
-!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
-!15 = !{i32 12, !16}
-!16 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
-!17 = !{i32 4, !18}
-!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
-!19 = !{i32 1, void ()* @"\01?main@@YAXXZ", !20}
-!20 = !{!21}
-!21 = !{i32 1, !22, !22}
-!22 = !{}
-!23 = !{null, !"", null, !24, null}
-!24 = !{!25, null, null, null}
-!25 = !{!26}
-!26 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !27}
-!27 = !{i32 0, i32 4}
-!28 = !{void ()* @"\01?main@@YAXXZ", i32 7}
-!29 = !{i32 -2147483584}
-!30 = !{i32 -1}
-!31 = !DILocation(line: 80, column: 3, scope: !32)
-!32 = !DISubprogram(name: "main", scope: !33, file: !33, line: 79, type: !34, isLocal: false, isDefinition: true, scopeLine: 79, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
-!33 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl", directory: "")
-!34 = !DISubroutineType(types: !22)
-!35 = !DILocation(line: 81, column: 11, scope: !32)
-!36 = !DILocation(line: 81, column: 18, scope: !32)
-!37 = !{!38, !38, i64 0}
-!38 = !{!"omnipotent char", !39, i64 0}
-!39 = !{!"Simple C/C++ TBAA"}
-!40 = !DILocation(line: 82, column: 11, scope: !32)
-!41 = !DILocation(line: 82, column: 16, scope: !32)
-!42 = !{!43, !43, i64 0}
-!43 = !{!"float", !38, i64 0}
-!44 = !DILocation(line: 83, column: 11, scope: !32)
-!45 = !DILocation(line: 83, column: 21, scope: !32)
-!46 = !DILocation(line: 84, column: 11, scope: !32)
-!47 = !DILocation(line: 84, column: 16, scope: !32)
-!48 = !DILocation(line: 86, column: 3, scope: !32)
-!49 = !DILocation(line: 87, column: 7, scope: !32)
-!50 = !DILocation(line: 87, column: 13, scope: !32)
-!51 = !DILocation(line: 89, column: 3, scope: !32)
-!52 = !DILocation(line: 89, column: 23, scope: !32)
-!53 = !DILocation(line: 99, column: 3, scope: !32)
-!54 = !DILocation(line: 100, column: 1, scope: !32)
+!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"}
+!3 = !{i32 1, i32 9}
+!4 = !{!"lib", i32 6, i32 9}
+!5 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !6, %struct.RayDesc undef, !11, %struct.Payload undef, !16, %"class.dx::HitObject" undef, !18}
+!6 = !{i32 4, !7, !8}
+!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
+!8 = !{i32 0, !9}
+!9 = !{!10}
+!10 = !{i32 0, float undef}
+!11 = !{i32 32, !12, !13, !14, !15}
+!12 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!13 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!14 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
+!15 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!16 = !{i32 12, !17}
+!17 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!18 = !{i32 4, !19}
+!19 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!20 = !{i32 1, void ()* @"\01?main@@YAXXZ", !21}
+!21 = !{!22}
+!22 = !{i32 1, !23, !23}
+!23 = !{}
+!24 = !{null, !"", null, !25, null}
+!25 = !{!26, null, null, null}
+!26 = !{!27}
+!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28}
+!28 = !{i32 0, i32 4}
+!29 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!30 = !{i32 -2147483584}
+!31 = !{i32 -1}
+!32 = !DILocation(line: 91, column: 3, scope: !33)
+!33 = !DISubprogram(name: "main", scope: !34, file: !34, line: 81, type: !35, isLocal: false, isDefinition: true, scopeLine: 81, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!34 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl", directory: "")
+!35 = !DISubroutineType(types: !23)
+!36 = !DILocation(line: 91, column: 23, scope: !33)
+!37 = !DILocation(line: 101, column: 3, scope: !33)
+!38 = !DILocation(line: 102, column: 1, scope: !33)
diff --git a/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll
new file mode 100644
index 0000000000..b969a63f12
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll
@@ -0,0 +1,160 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+
+; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%"$Globals" = type { %struct.RayDesc }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.RayQuery<513, 0>" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"$Globals" = external constant %"$Globals"
+
+; Function Attrs: nounwind
+define void @main() #0 {
+entry:
+  ; Verifies that dxilgen lowers the HL TraceRayInline call below (first operand i32 325), whose RayDesc fields are loaded from the $Globals cbuffer, into dx.op.rayQuery_TraceRayInline with scalar ray operands.
+  ; Capture the annotated $Globals CB handle, the annotated RTAS handle, and the allocated RayQuery
+  ; CHECK-DAG: %[[CB:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %"$Globals", %dx.types.ResourceProperties { i32 13, i32 32 })
+  ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 })
+  ; CHECK-DAG: %[[RQ:[^ ,]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513)
+
+  %0 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0)
+  %1 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %0, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef)
+  %2 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %1, i32 0)
+  %3 = getelementptr inbounds %"$Globals", %"$Globals"* %2, i32 0, i32 0
+  %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !34 ; line:12 col:71
+  %4 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !38 ; line:13 col:3
+  %5 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %4), !dbg !38 ; line:13 col:3
+  %6 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !38 ; line:13 col:3
+
+  ; Load RayDesc.Origin
+  ; CHECK: %[[ORIG_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 0)
+  ; CHECK: %[[ORIG_EX0:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 0
+  ; CHECK: %[[ORIG_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[ORIG_EX0]], i64 0
+  ; CHECK: %[[ORIG_EX1:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 1
+  ; CHECK: %[[ORIG_VXY:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VX]], float %[[ORIG_EX1]], i64 1
+  ; CHECK: %[[ORIG_EX2:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 2
+  ; CHECK: %[[ORIG_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VXY]], float %[[ORIG_EX2]], i64 2
+
+  ; Load RayDesc.TMin
+  ; CHECK: %[[TMIN_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 0)
+  ; CHECK: %[[TMIN:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[TMIN_CB_LD]], 3
+
+  ; Load RayDesc.Direction
+  ; CHECK: %[[DIR_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 1)
+  ; CHECK: %[[DIR_EX0:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 0
+  ; CHECK: %[[DIR_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[DIR_EX0]], i64 0
+  ; CHECK: %[[DIR_EX1:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 1
+  ; CHECK: %[[DIR_VXY:[^ ,]+]] = insertelement <3 x float> %[[DIR_VX]], float %[[DIR_EX1]], i64 1
+  ; CHECK: %[[DIR_EX2:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 2
+  ; CHECK: %[[DIR_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[DIR_VXY]], float %[[DIR_EX2]], i64 2
+
+  ; Load RayDesc.TMax (captured by pattern like the other fields, so the test does not depend on SSA value numbering)
+  ; CHECK: %[[TMAX_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 1)
+  ; CHECK: %[[TMAX:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[TMAX_CB_LD]], 3
+
+  ; Extract RayDesc vector fields
+  ; CHECK: %[[ORIGX:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 0
+  ; CHECK: %[[ORIGY:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 1
+  ; CHECK: %[[ORIGZ:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 2
+  ; CHECK: %[[DIRX:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 0
+  ; CHECK: %[[DIRY:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 1
+  ; CHECK: %[[DIRZ:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 2
+
+  %7 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 0, !dbg !38 ; line:13 col:3
+  %8 = load <3 x float>, <3 x float>* %7, !dbg !38 ; line:13 col:3
+  %9 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 1, !dbg !38 ; line:13 col:3
+  %10 = load float, float* %9, !dbg !38 ; line:13 col:3
+  %11 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 2, !dbg !38 ; line:13 col:3
+  %12 = load <3 x float>, <3 x float>* %11, !dbg !38 ; line:13 col:3
+  %13 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 3, !dbg !38 ; line:13 col:3
+  %14 = load float, float* %13, !dbg !38 ; line:13 col:3
+
+  ; Call TraceRayInline
+  ; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, float %[[ORIGX]], float %[[ORIGY]], float %[[ORIGZ]], float %[[TMIN]], float %[[DIRX]], float %[[DIRY]], float %[[DIRZ]], float %[[TMAX]])
+
+  call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %rayQuery1, %dx.types.Handle %6, i32 1, i32 2, <3 x float> %8, float %10, <3 x float> %12, float %14), !dbg !38 ; line:13 col:3
+  ret void, !dbg !39 ; line:14 col:1
+}
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6, !20}
+!dx.entryPoints = !{!24}
+!dx.fnprops = !{!31}
+!dx.options = !{!32, !33}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"}
+!3 = !{i32 1, i32 5}
+!4 = !{i32 1, i32 9}
+!5 = !{!"vs", i32 6, i32 5}
+!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12, %"$Globals" undef, !18}
+!7 = !{i32 32, !8, !9, !10, !11}
+!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9}
+!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9}
+!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!12 = !{i32 4, !13, !14}
+!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!14 = !{i32 0, !15}
+!15 = !{!16, !17}
+!16 = !{i32 1, i64 513}
+!17 = !{i32 1, i64 0}
+!18 = !{i32 32, !19}
+!19 = !{i32 6, !"rayDesc", i32 3, i32 0}
+!20 = !{i32 1, void ()* @main, !21}
+!21 = !{!22}
+!22 = !{i32 1, !23, !23}
+!23 = !{}
+!24 = !{void ()* @main, !"main", null, !25, null}
+!25 = !{!26, null, !29, null}
+!26 = !{!27}
+!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28}
+!28 = !{i32 0, i32 4}
+!29 = !{!30}
+!30 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 32, null}
+!31 = !{void ()* @main, i32 1}
+!32 = !{i32 64}
+!33 = !{i32 -1}
+!34 = !DILocation(line: 12, column: 71, scope: !35)
+!35 = !DISubprogram(name: "main", scope: !36, file: !36, line: 11, type: !37, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @main)
+!36 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl", directory: "")
+!37 = !DISubroutineType(types: !23)
+!38 = !DILocation(line: 13, column: 3, scope: !35)
+!39 = !DILocation(line: 14, column: 1, scope: !35)
diff --git a/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll
new file mode 100644
index 0000000000..0d97d8782d
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll
@@ -0,0 +1,134 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s
+
+; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl,
+; with call to DoTrace commented out.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%"class.RayQuery<513, 0>" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #0
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @main(float* noalias, <3 x float>, float, <3 x float>, float) #1 {
+entry:
+  ; Verifies that dxilgen lowers the HL TraceRayInline call below (first operand i32 325), whose ray fields are the function arguments %1-%4, into dx.op.rayQuery_TraceRayInline with scalar ray operands.
+  ; Load RayDesc fields from input: one dx.op.loadInput per scalar component, with Origin and Direction rebuilt into <3 x float> vectors
+  ; CHECK-DAG: %[[ORIGX_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+  ; CHECK-DAG: %[[ORIGY_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef)
+  ; CHECK-DAG: %[[ORIGZ_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 2, i32 undef)
+  ; CHECK-DAG: %[[TMIN:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+  ; CHECK-DAG: %[[DIRX_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 0, i32 undef)
+  ; CHECK-DAG: %[[DIRY_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 1, i32 undef)
+  ; CHECK-DAG: %[[DIRZ_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 2, i32 undef)
+  ; CHECK-DAG: %[[TMAX:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 3, i32 0, i8 0, i32 undef)
+  ; CHECK-DAG: %[[ORIG_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[ORIGX_LI]], i64 0
+  ; CHECK-DAG: %[[ORIG_VXY:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VX]], float %[[ORIGY_LI]], i64 1
+  ; CHECK-DAG: %[[ORIG_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VXY]], float %[[ORIGZ_LI]], i64 2
+  ; CHECK-DAG: %[[DIR_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[DIRX_LI]], i64 0
+  ; CHECK-DAG: %[[DIR_VXY:[^ ,]+]] = insertelement <3 x float> %[[DIR_VX]], float %[[DIRY_LI]], i64 1
+  ; CHECK-DAG: %[[DIR_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[DIR_VXY]], float %[[DIRZ_LI]], i64 2
+
+  ; Capture the allocated RayQuery and the annotated RTAS handle
+  ; CHECK-DAG: %[[RQ:[^ ,]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513)
+  ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 })
+
+  %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !41 ; line:15 col:71
+  %5 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !45 ; line:17 col:3
+  %6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %5), !dbg !45 ; line:17 col:3
+  %7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !45 ; line:17 col:3
+
+  ; Extract RayDesc vector fields back into the scalars passed to the lowered call
+  ; CHECK-DAG: %[[ORIGX:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 0
+  ; CHECK-DAG: %[[ORIGY:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 1
+  ; CHECK-DAG: %[[ORIGZ:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 2
+  ; CHECK-DAG: %[[DIRX:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 0
+  ; CHECK-DAG: %[[DIRY:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 1
+  ; CHECK-DAG: %[[DIRZ:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 2
+
+  ; Call TraceRayInline with the captured query, RTAS handle and scalar ray operands (TMin and TMax come directly from loadInput)
+  ; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, float %[[ORIGX]], float %[[ORIGY]], float %[[ORIGZ]], float %[[TMIN]], float %[[DIRX]], float %[[DIRY]], float %[[DIRZ]], float %[[TMAX]])
+
+  call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %rayQuery1, %dx.types.Handle %7, i32 1, i32 2, <3 x float> %1, float %2, <3 x float> %3, float %4), !dbg !45 ; line:17 col:3
+  store float 0.000000e+00, float* %0, !dbg !46 ; line:18 col:3
+  ret void, !dbg !46 ; line:18 col:3
+}
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6, !18}
+!dx.entryPoints = !{!33}
+!dx.fnprops = !{!38}
+!dx.options = !{!39, !40}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"}
+!3 = !{i32 1, i32 5}
+!4 = !{i32 1, i32 9}
+!5 = !{!"vs", i32 6, i32 5}
+!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12}
+!7 = !{i32 32, !8, !9, !10, !11}
+!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9}
+!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9}
+!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!12 = !{i32 4, !13, !14}
+!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!14 = !{i32 0, !15}
+!15 = !{!16, !17}
+!16 = !{i32 1, i64 513}
+!17 = !{i32 1, i64 0}
+!18 = !{i32 1, void (float*, <3 x float>, float, <3 x float>, float)* @main, !19}
+!19 = !{!20, !22, !25, !27, !29, !31}
+!20 = !{i32 0, !21, !21}
+!21 = !{}
+!22 = !{i32 1, !23, !24}
+!23 = !{i32 4, !"OUT", i32 7, i32 9}
+!24 = !{i32 0}
+!25 = !{i32 0, !26, !24}
+!26 = !{i32 4, !"RAYDESC", i32 7, i32 9}
+!27 = !{i32 0, !26, !28}
+!28 = !{i32 1}
+!29 = !{i32 0, !26, !30}
+!30 = !{i32 2}
+!31 = !{i32 0, !26, !32}
+!32 = !{i32 3}
+!33 = !{void (float*, <3 x float>, float, <3 x float>, float)* @main, !"main", null, !34, null}
+!34 = !{!35, null, null, null}
+!35 = !{!36}
+!36 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !37}
+!37 = !{i32 0, i32 4}
+!38 = !{void (float*, <3 x float>, float, <3 x float>, float)* @main, i32 1}
+!39 = !{i32 64}
+!40 = !{i32 -1}
+!41 = !DILocation(line: 15, column: 71, scope: !42)
+!42 = !DISubprogram(name: "main", scope: !43, file: !43, line: 14, type: !44, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: false, function: void (float*, <3 x float>, float, <3 x float>, float)* @main)
+!43 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl", directory: "")
+!44 = !DISubroutineType(types: !21)
+!45 = !DILocation(line: 17, column: 3, scope: !42)
+!46 = !DILocation(line: 18, column: 3, scope: !42)
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
index 5afd30b524..85c3a34eb9 100644
--- a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll
@@ -95,10 +95,10 @@ target triple = "dxil-ms-dx"
 @"$Globals" = external constant %ConstantBuffer
 
 ; CHECK: %[[RQA:[^ ]+]] = alloca i32
-; CHECK: %[[ATTRA0:[^ ]+]] = alloca %struct.CustomAttrs
-; CHECK: %[[ATTRA1:[^ ]+]] = alloca %struct.CustomAttrs
 ; CHECK: %[[XATTRA:[^ ]+]] = alloca float
 ; CHECK: %[[YATTRA:[^ ]+]] = alloca float
+; CHECK: %[[ATTRA0:[^ ]+]] = alloca %struct.CustomAttrs
+; CHECK: %[[ATTRA1:[^ ]+]] = alloca %struct.CustomAttrs
 
 ; COM: Check same query handle used for TraceRayInline and the FromRayQuery calls
 ; CHECK: %[[RQH:[^ ]+]] = load i32, i32* %[[RQA]]
@@ -122,7 +122,7 @@ target triple = "dxil-ms-dx"
 ; CHECK: store float %[[XF1]], float* %[[XPTR0]]
 ; CHECK: %[[YPTR0:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA0]], i32 0, i32 1
 ; CHECK: %[[YF1:[^ ]+]] = load float, float* %[[YATTRA]]
-; CHECK: store float %[[YF1]], float* %[[YPTR0]], align 4
+; CHECK: store float %[[YF1]], float* %[[YPTR0]]
 ; CHECK: %[[RQH1:[^ ]+]] = load i32, i32* %[[RQA]]
 ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH1]], i32 16, %struct.CustomAttrs* %[[ATTRA0]])
 
@@ -140,7 +140,7 @@ target triple = "dxil-ms-dx"
 ; CHECK: store float %[[XF2]], float* %[[XPTR1]]
 ; CHECK: %[[YPTR1:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA1]], i32 0, i32 1
 ; CHECK: %[[YF2:[^ ]+]] = load float, float* %[[YATTRA]]
-; CHECK: store float %[[YF2]], float* %[[YPTR1]], align 4
+; CHECK: store float %[[YF2]], float* %[[YPTR1]]
 ; CHECK: %[[RQH2:[^ ]+]] = load i32, i32* %[[RQA]]
 ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH2]], i32 17, %struct.CustomAttrs* %[[ATTRA1]])
 
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll
index 89ee886c2e..78f7271e94 100644
--- a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll
@@ -33,7 +33,7 @@ entry:
   %hit = alloca %dx.types.HitObject, align 4
   %tmp = alloca %dx.types.HitObject, align 4
   %ray = alloca %struct.RayDesc, align 4
-; CHECK-NOT: %{{[^ ]+}} = alloca %struct.RayDesc
+; CHECK-NOT: alloca %struct.RayDesc
   %tmp2 = alloca %dx.types.HitObject, align 4
 ; CHECK: %[[HIT0:[^ ]+]] = alloca %dx.types.HitObject, align 4
 ; CHECK: %[[HIT1:[^ ]+]] = alloca %dx.types.HitObject, align 4
@@ -69,7 +69,16 @@ entry:
 ; CHECK-DAG: %[[RDTMIN:[^ ]+]] = load float, float* %[[pRDTMIN]],
 ; CHECK-DAG: %[[RDD:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDD]],
 ; CHECK-DAG: %[[RDTMAX:[^ ]+]] = load float, float* %[[pRDTMAX]],
-; CHECK:  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 387, %dx.types.HitObject* %[[HIT2]], i32 0, i32 1, <3 x float> %[[RDO]], float %[[RDTMIN]], <3 x float> %[[RDD]], float %[[RDTMAX]])
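+; The RayDesc argument is copied before the call: each loaded field is stored to a second location and reloaded, and the call takes the reloaded values.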
+; CHECK-DAG: store <3 x float> %[[RDO]], <3 x float>* %[[pRDO2:[^ ]+]],
+; CHECK-DAG: store float %[[RDTMIN]], float* %[[pRDTMIN2:[^ ]+]],
+; CHECK-DAG: store <3 x float> %[[RDD]], <3 x float>* %[[pRDD2:[^ ]+]],
+; CHECK-DAG: store float %[[RDTMAX]], float* %[[pRDTMAX2:[^ ]+]],
+; CHECK-DAG: %[[RDO2:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDO2]],
+; CHECK-DAG: %[[RDTMIN2:[^ ]+]] = load float, float* %[[pRDTMIN2]],
+; CHECK-DAG: %[[RDD2:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDD2]],
+; CHECK-DAG: %[[RDTMAX2:[^ ]+]] = load float, float* %[[pRDTMAX2]],
+; CHECK:  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 387, %dx.types.HitObject* %[[HIT2]], i32 0, i32 1, <3 x float> %[[RDO2]], float %[[RDTMIN2]], <3 x float> %[[RDD2]], float %[[RDTMAX2]])
   call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.RayDesc*)"(i32 387, %dx.types.HitObject* %tmp2, i32 0, i32 1, %struct.RayDesc* %ray), !dbg !31 ; line:45 col:3
   %10 = bitcast %dx.types.HitObject* %tmp2 to i8*, !dbg !31 ; line:45 col:3
   call void @llvm.lifetime.end(i64 4, i8* %10) #0, !dbg !31 ; line:45 col:3
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll
new file mode 100644
index 0000000000..fa22ee5744
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll
@@ -0,0 +1,198 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+
+; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%"class.RWStructuredBuffer<float>" = type { float }
+%ConstantBuffer = type opaque
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%struct.Payload = type { <3 x float> }
+%dx.types.HitObject = type { i8* }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.dx::HitObject" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"\01?UAV@@3V?$RWStructuredBuffer@M@@A" = external global %"class.RWStructuredBuffer<float>", align 4
+@"$Globals" = external constant %ConstantBuffer
+
+; Function Attrs: nounwind
+define void @"\01?main@@YAXXZ"() #0 {
+entry:
+  %rayDesc = alloca %struct.RayDesc, align 4
+  %pld = alloca %struct.Payload, align 4
+
+  ; CHECK: %[[HITOBJ:[^ ,]+]] = alloca %dx.types.HitObject, align 4
+
+  %hit = alloca %dx.types.HitObject, align 4
+
+  %0 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !37 ; line:82 col:3
+  call void @llvm.lifetime.start(i64 32, i8* %0) #0, !dbg !37 ; line:82 col:3
+
+  ; Init RayDesc.
+  ; CHECK-DAG: store <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, <3 x float>* %[[ORIGIN_P0:[^ ,]+]], align 4
+  ; CHECK-DAG: store float 3.000000e+00, float* %[[TMIN_P0:[^ ,]+]], align 4
+  ; CHECK-DAG: store <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, <3 x float>* %[[DIRECTION_P0:[^ ,]+]], align 4
+  ; CHECK-DAG: store float 7.000000e+00, float* %[[TMAX_P0:[^ ,]+]], align 4
+
+  %Origin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 0, !dbg !41 ; line:83 col:11
+  store <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, <3 x float>* %Origin, align 4, !dbg !42, !tbaa !43 ; line:83 col:18
+  %TMin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 1, !dbg !46 ; line:84 col:11
+  store float 3.000000e+00, float* %TMin, align 4, !dbg !47, !tbaa !48 ; line:84 col:16
+  %Direction = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 2, !dbg !50 ; line:85 col:11
+  store <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, <3 x float>* %Direction, align 4, !dbg !51, !tbaa !43 ; line:85 col:21
+  %TMax = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 3, !dbg !52 ; line:86 col:11
+  store float 7.000000e+00, float* %TMax, align 4, !dbg !53, !tbaa !48 ; line:86 col:16
+
+  %1 = bitcast %struct.Payload* %pld to i8*, !dbg !54 ; line:88 col:3
+  call void @llvm.lifetime.start(i64 12, i8* %1) #0, !dbg !54 ; line:88 col:3
+  %dummy = getelementptr inbounds %struct.Payload, %struct.Payload* %pld, i32 0, i32 0, !dbg !55 ; line:89 col:7
+  store <3 x float> <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>, <3 x float>* %dummy, align 4, !dbg !56, !tbaa !43 ; line:89 col:13
+  %2 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !57 ; line:91 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %2) #0, !dbg !57 ; line:91 col:3
+  %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !58 ; line:91 col:23
+  %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !58 ; line:91 col:23
+
+  ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef)
+
+  %5 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !58 ; line:91 col:23
+
+  ; Copy RayDesc.
+  ; CHECK-DAG: %[[ORIGIN_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIGIN_P0]]
+  ; CHECK-DAG: store <3 x float> %[[ORIGIN_L0]], <3 x float>* %[[ORIGIN_P1:[^ ,]+]]
+  ; CHECK-DAG: %[[TMIN_L0:[^ ,]+]] = load float, float* %[[TMIN_P0]]
+  ; CHECK-DAG: store float %[[TMIN_L0]], float* %[[TMIN_P1:[^ ,]+]]
+  ; CHECK-DAG: %[[DIRECTION_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_P0]]
+  ; CHECK-DAG: store <3 x float> %[[DIRECTION_L0]], <3 x float>* %[[DIRECTION_P1:[^ ,]+]]
+  ; CHECK-DAG: %[[TMAX_L0:[^ ,]+]] = load float, float* %[[TMAX_P0]]
+  ; CHECK-DAG: store float %[[TMAX_L0]], float* %[[TMAX_P1:[^ ,]+]]
+
+  ; Load RayDesc.
+  ; CHECK-DAG: %[[ORIGIN_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIGIN_P1]]
+  ; CHECK-DAG: %[[TMIN_L1:[^ ,]+]] = load float, float* %[[TMIN_P1]]
+  ; CHECK-DAG: %[[DIRECTION_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_P1]]
+  ; CHECK-DAG: %[[TMAX_L1:[^ ,]+]] = load float, float* %[[TMAX_P1]]
+
+  ; RayDesc is scalar replaced in HL op for dx::HitObject::TraceRay.
+  ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 389, %dx.types.HitObject* %[[HITOBJ]], %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, <3 x float> %[[ORIGIN_L1]], float %[[TMIN_L1]], <3 x float> %[[DIRECTION_L1]], float %[[TMAX_L1]], %struct.Payload* %[[PLD_P0:[^ ,]+]])
+
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %rayDesc, %struct.Payload* %pld), !dbg !58 ; line:91 col:23
+
+  ; Copy payload.
+  ; CHECK: %[[GEP_PLD_P0:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0]], i32 0, i32 0
+  ; CHECK: %[[PLD_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[GEP_PLD_P0]]
+  ; CHECK: store <3 x float> %[[PLD_L0]], <3 x float>* %[[PLD_M0_P0:[^ ,]+]]
+  ; CHECK: %[[GEP_PLD_P1:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P1:[^ ,]+]], i32 0, i32 0
+  ; CHECK: %[[PLD_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[PLD_M0_P0]]
+  ; CHECK: store <3 x float> %[[PLD_L1]], <3 x float>* %[[GEP_PLD_P1]]
+
+  ; dx::HitObject::Invoke
+  ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %[[HITOBJ]], %struct.Payload* %[[PLD_P1]])
+
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld), !dbg !59 ; line:101 col:3
+
+  %6 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !60 ; line:102 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %6) #0, !dbg !60 ; line:102 col:1
+  %7 = bitcast %struct.Payload* %pld to i8*, !dbg !60 ; line:102 col:1
+  call void @llvm.lifetime.end(i64 12, i8* %7) #0, !dbg !60 ; line:102 col:1
+  %8 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !60 ; line:102 col:1
+  call void @llvm.lifetime.end(i64 32, i8* %8) #0, !dbg !60 ; line:102 col:1
+  ret void, !dbg !60 ; line:102 col:1
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32, %dx.types.HitObject*, %struct.Payload*) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!3}
+!dx.shaderModel = !{!4}
+!dx.typeAnnotations = !{!5, !20}
+!dx.entryPoints = !{!24}
+!dx.fnprops = !{!34}
+!dx.options = !{!35, !36}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"}
+!3 = !{i32 1, i32 9}
+!4 = !{!"lib", i32 6, i32 9}
+!5 = !{i32 0, %"class.RWStructuredBuffer<float>" undef, !6, %struct.RayDesc undef, !11, %struct.Payload undef, !16, %"class.dx::HitObject" undef, !18}
+!6 = !{i32 4, !7, !8}
+!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9}
+!8 = !{i32 0, !9}
+!9 = !{!10}
+!10 = !{i32 0, float undef}
+!11 = !{i32 32, !12, !13, !14, !15}
+!12 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!13 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!14 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3}
+!15 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!16 = !{i32 12, !17}
+!17 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3}
+!18 = !{i32 4, !19}
+!19 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4}
+!20 = !{i32 1, void ()* @"\01?main@@YAXXZ", !21}
+!21 = !{!22}
+!22 = !{i32 1, !23, !23}
+!23 = !{}
+!24 = !{null, !"", null, !25, null}
+!25 = !{!26, !29, !32, null}
+!26 = !{!27}
+!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28}
+!28 = !{i32 0, i32 4}
+!29 = !{!30}
+!30 = !{i32 0, %"class.RWStructuredBuffer<float>"* @"\01?UAV@@3V?$RWStructuredBuffer@M@@A", !"UAV", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false, i1 false, !31}
+!31 = !{i32 1, i32 4}
+!32 = !{!33}
+!33 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
+!34 = !{void ()* @"\01?main@@YAXXZ", i32 7}
+!35 = !{i32 -2147483584}
+!36 = !{i32 -1}
+!37 = !DILocation(line: 82, column: 3, scope: !38)
+!38 = !DISubprogram(name: "main", scope: !39, file: !39, line: 81, type: !40, isLocal: false, isDefinition: true, scopeLine: 81, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!39 = !DIFile(filename: "D:\5Cgit\5Cdxc\5Cmain\5Ctools\5Cclang\5Ctest\5CCodeGenDXIL\5Chlsl\5Cobjects\5CHitObject\5Chitobject_traceinvoke.hlsl", directory: "")
+!40 = !DISubroutineType(types: !23)
+!41 = !DILocation(line: 83, column: 11, scope: !38)
+!42 = !DILocation(line: 83, column: 18, scope: !38)
+!43 = !{!44, !44, i64 0}
+!44 = !{!"omnipotent char", !45, i64 0}
+!45 = !{!"Simple C/C++ TBAA"}
+!46 = !DILocation(line: 84, column: 11, scope: !38)
+!47 = !DILocation(line: 84, column: 16, scope: !38)
+!48 = !{!49, !49, i64 0}
+!49 = !{!"float", !44, i64 0}
+!50 = !DILocation(line: 85, column: 11, scope: !38)
+!51 = !DILocation(line: 85, column: 21, scope: !38)
+!52 = !DILocation(line: 86, column: 11, scope: !38)
+!53 = !DILocation(line: 86, column: 16, scope: !38)
+!54 = !DILocation(line: 88, column: 3, scope: !38)
+!55 = !DILocation(line: 89, column: 7, scope: !38)
+!56 = !DILocation(line: 89, column: 13, scope: !38)
+!57 = !DILocation(line: 91, column: 3, scope: !38)
+!58 = !DILocation(line: 91, column: 23, scope: !38)
+!59 = !DILocation(line: 101, column: 3, scope: !38)
+!60 = !DILocation(line: 102, column: 1, scope: !38)
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll
new file mode 100644
index 0000000000..59551a7eb4
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll
@@ -0,0 +1,182 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%"$Globals" = type { i32, i32, i32, i32, i32 }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%struct.Payload = type { <2 x float>, <3 x i32> }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+
+@"\01?Acc@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"\01?RayFlags@@3IB" = external constant i32, align 4
+@"\01?InstanceInclusionMask@@3IB" = external constant i32, align 4
+@"\01?RayContributionToHitGroupIndex@@3IB" = external constant i32, align 4
+@"\01?MultiplierForGeometryContributionToHitGroupIndex@@3IB" = external constant i32, align 4
+@"\01?MissShaderIndex@@3IB" = external constant i32, align 4
+@"$Globals" = external constant %"$Globals"
+
+; CHECK: define <4 x float> @"
+; CHECK-SAME: ?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z"(<2 x float>* noalias dereferenceable(8) %f2, %struct.RayDesc* %Ray, %struct.Payload* noalias %p)
+
+; Function Attrs: nounwind
+define <4 x float> @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z"(<2 x float>* noalias dereferenceable(8) %f2, %struct.RayDesc* %Ray, %struct.Payload* noalias %p) #0 {
+entry:
+
+  ; Copy Payload fields (PLD_F0, PLD_F1) to local allocas:
+  ; CHECK: %[[GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %p, i32 0, i32 0
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <2 x float>, <2 x float>* %[[GEP]]
+  ; CHECK: store <2 x float> %[[LOAD]], <2 x float>* %[[PLD_F0:[^ ,]+]]
+  ; CHECK: %[[GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %p, i32 0, i32 1
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x i32>, <3 x i32>* %[[GEP]]
+  ; CHECK: store <3 x i32> %[[LOAD]], <3 x i32>* %[[PLD_F1:[^ ,]+]]
+
+  %0 = alloca %struct.RayDesc, !dbg !39 ; line:22 col:61
+  %1 = bitcast %struct.RayDesc* %0 to i8*, !dbg !39 ; line:22 col:61
+  %2 = bitcast %struct.RayDesc* %Ray to i8*, !dbg !39 ; line:22 col:61
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 32, i32 1, i1 false), !dbg !39 ; line:22 col:61
+  %3 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0), !dbg !39 ; line:22 col:61
+  %4 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 13, i32 20 }, %"$Globals" undef), !dbg !39 ; line:22 col:61
+  %5 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %4, i32 0), !dbg !39 ; line:22 col:61
+  %6 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 0, !dbg !39 ; line:22 col:61
+  %7 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 1, !dbg !39 ; line:22 col:61
+  %8 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 2, !dbg !39 ; line:22 col:61
+  %9 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 3, !dbg !39 ; line:22 col:61
+  %10 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 4, !dbg !39 ; line:22 col:61
+  %11 = load i32, i32* %10, align 4, !dbg !39, !tbaa !43 ; line:22 col:61
+  %12 = load i32, i32* %9, align 4, !dbg !47, !tbaa !43 ; line:22 col:12
+  %13 = load i32, i32* %8, align 4, !dbg !48, !tbaa !43 ; line:21 col:12
+  %14 = load i32, i32* %7, align 4, !dbg !49, !tbaa !43 ; line:20 col:25
+  %15 = load i32, i32* %6, align 4, !dbg !50, !tbaa !43 ; line:20 col:16
+  %16 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?Acc@@3URaytracingAccelerationStructure@@A", !dbg !51 ; line:20 col:3
+  %17 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %16), !dbg !51 ; line:20 col:3
+
+  ; CHECK: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef)
+  %18 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %17, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !51 ; line:20 col:3
+
+  ; Copy RayDesc fields (Origin, TMin, Direction, TMax) to local allocas:
+  ; CHECK: %[[RAY_ORIGIN_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 0
+  ; CHECK: %[[RAY_ORIGIN_LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RAY_ORIGIN_GEP]]
+  ; CHECK: store <3 x float> %[[RAY_ORIGIN_LOAD]], <3 x float>* %[[RAY_ORIGIN_P0:[^ ,]+]]
+  ; CHECK: %[[TMIN_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 1
+  ; CHECK: %[[TMIN_LOAD:[^ ,]+]] = load float, float* %[[TMIN_GEP]]
+  ; CHECK: store float %[[TMIN_LOAD]], float* %[[TMIN_P0:[^ ,]+]]
+  ; CHECK: %[[DIRECTION_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 2
+  ; CHECK: %[[DIRECTION_LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_GEP]]
+  ; CHECK: store <3 x float> %[[DIRECTION_LOAD]], <3 x float>* %[[DIRECTION_P0:[^ ,]+]]
+  ; CHECK: %[[TMAX_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 3
+  ; CHECK: %[[TMAX_LOAD:[^ ,]+]] = load float, float* %[[TMAX_GEP]]
+  ; CHECK: store float %[[TMAX_LOAD]], float* %[[TMAX_P0:[^ ,]+]]
+
+  ; Copy Payload fields into payload struct for call:
+  ; CHECK: %[[PLD_F0_GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0:[^ ,]+]], i32 0, i32 0
+  ; CHECK: %[[PLD_F0_LOAD:[^ ,]+]] = load <2 x float>, <2 x float>* %[[PLD_F0]]
+  ; CHECK: store <2 x float> %[[PLD_F0_LOAD]], <2 x float>* %[[PLD_F0_GEP]]
+  ; CHECK: %[[PLD_F1_GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0]], i32 0, i32 1
+  ; CHECK: %[[PLD_F1_LOAD:[^ ,]+]] = load <3 x i32>, <3 x i32>* %[[PLD_F1]]
+  ; CHECK: store <3 x i32> %[[PLD_F1_LOAD]], <3 x i32>* %[[PLD_F1_GEP]]
+
+  ; Load RayDesc fields:
+  ; CHECK: %[[RAY_ORIGIN_LOAD2:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RAY_ORIGIN_P0]]
+  ; CHECK: %[[TMIN_LOAD2:[^ ,]+]] = load float, float* %[[TMIN_P0]]
+  ; CHECK: %[[DIRECTION_LOAD2:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_P0]]
+  ; CHECK: %[[TMAX_LOAD2:[^ ,]+]] = load float, float* %[[TMAX_P0]]
+
+  ; Call TraceRay with the loaded RayDesc fields and the payload struct:
+  ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 69, %dx.types.Handle %[[RTAS]], i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, <3 x float> %[[RAY_ORIGIN_LOAD2]], float %[[TMIN_LOAD2]], <3 x float> %[[DIRECTION_LOAD2]], float %[[TMAX_LOAD2]], %struct.Payload* %[[PLD_P0]])
+
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 69, %dx.types.Handle %18, i32 %15, i32 %14, i32 %13, i32 %12, i32 %11, %struct.RayDesc* %0, %struct.Payload* %p), !dbg !51 ; line:20 col:3
+
+  ret <4 x float> <float 0x4004CCCCC0000000, float 0x4004CCCCC0000000, float 0x4004CCCCC0000000, float 0x4004CCCCC0000000>, !dbg !52 ; line:24 col:4
+}
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6, !21}
+!dx.entryPoints = !{!30}
+!dx.fnprops = !{}
+!dx.options = !{!37, !38}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"}
+!3 = !{i32 1, i32 3}
+!4 = !{i32 1, i32 9}
+!5 = !{!"lib", i32 6, i32 3}
+!6 = !{i32 0, %struct.RayDesc undef, !7, %struct.Payload undef, !12, %"$Globals" undef, !15}
+!7 = !{i32 32, !8, !9, !10, !11}
+!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9}
+!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9}
+!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!12 = !{i32 28, !13, !14}
+!13 = !{i32 6, !"t", i32 3, i32 0, i32 7, i32 9}
+!14 = !{i32 6, !"t2", i32 3, i32 16, i32 7, i32 4}
+!15 = !{i32 20, !16, !17, !18, !19, !20}
+!16 = !{i32 6, !"RayFlags", i32 3, i32 0, i32 7, i32 5}
+!17 = !{i32 6, !"InstanceInclusionMask", i32 3, i32 4, i32 7, i32 5}
+!18 = !{i32 6, !"RayContributionToHitGroupIndex", i32 3, i32 8, i32 7, i32 5}
+!19 = !{i32 6, !"MultiplierForGeometryContributionToHitGroupIndex", i32 3, i32 12, i32 7, i32 5}
+!20 = !{i32 6, !"MissShaderIndex", i32 3, i32 16, i32 7, i32 5}
+!21 = !{i32 1, <4 x float> (<2 x float>*, %struct.RayDesc*, %struct.Payload*)* @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z", !22}
+!22 = !{!23, !26, !27, !29}
+!23 = !{i32 1, !24, !25}
+!24 = !{i32 7, i32 9}
+!25 = !{}
+!26 = !{i32 2, !24, !25}
+!27 = !{i32 0, !28, !25}
+!28 = !{i32 4, !"R"}
+!29 = !{i32 2, !25, !25}
+!30 = !{null, !"", null, !31, null}
+!31 = !{!32, null, !35, null}
+!32 = !{!33}
+!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?Acc@@3URaytracingAccelerationStructure@@A", !"Acc", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34}
+!34 = !{i32 0, i32 4}
+!35 = !{!36}
+!36 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 20, null}
+!37 = !{i32 -2147483584}
+!38 = !{i32 11}
+!39 = !DILocation(line: 22, column: 61, scope: !40)
+!40 = !DISubprogram(name: "emit", scope: !41, file: !41, line: 19, type: !42, isLocal: false, isDefinition: true, scopeLine: 19, flags: DIFlagPrototyped, isOptimized: false, function: <4 x float> (<2 x float>*, %struct.RayDesc*, %struct.Payload*)* @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z")
+!41 = !DIFile(filename: "D:\5Cgit\5Cdxc\5Cmain\5Ctools\5Cclang\5Ctest\5CHLSLFileCheck\5Cshader_targets\5Craytracing\5Craytracing_traceray.hlsl", directory: "")
+!42 = !DISubroutineType(types: !25)
+!43 = !{!44, !44, i64 0}
+!44 = !{!"int", !45, i64 0}
+!45 = !{!"omnipotent char", !46, i64 0}
+!46 = !{!"Simple C/C++ TBAA"}
+!47 = !DILocation(line: 22, column: 12, scope: !40)
+!48 = !DILocation(line: 21, column: 12, scope: !40)
+!49 = !DILocation(line: 20, column: 25, scope: !40)
+!50 = !DILocation(line: 20, column: 16, scope: !40)
+!51 = !DILocation(line: 20, column: 3, scope: !40)
+!52 = !DILocation(line: 24, column: 4, scope: !40)
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll
new file mode 100644
index 0000000000..c01ec797bb
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll
@@ -0,0 +1,154 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+
+; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%"$Globals" = type { %struct.RayDesc }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%"class.RayQuery<513, 0>" = type { i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"\01?rayDesc@@3URayDesc@@B" = external constant %struct.RayDesc, align 4
+@"$Globals" = external constant %"$Globals"
+
+; Function Attrs: nounwind
+define void @main() #0 {
+entry:
+  %0 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0)
+
+  ; Capture CB, RayDesc ptr from CB, RTAS, and init RayQuery
+  ; CHECK-DAG: %[[CB_H:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef)
+
+  %1 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %0, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef)
+
+  ; CHECK-DAG: %[[CB_PTR:[^ ,]+]] = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %[[CB_H]], i32 0)
+
+  %2 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %1, i32 0)
+
+  ; CHECK-DAG: %[[RAYDESC_PTR:[^ ,]+]] = getelementptr inbounds %"$Globals", %"$Globals"* %[[CB_PTR]], i32 0, i32 0
+
+  %3 = getelementptr inbounds %"$Globals", %"$Globals"* %2, i32 0, i32 0
+
+  ; CHECK-DAG: %[[RQ0:[^ ,]+]] = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0)
+  ; CHECK-DAG: store i32 %[[RQ0]], i32* %[[RQ_P0:[^ ,]+]]
+
+  %rayQuery = alloca %"class.RayQuery<513, 0>", align 4
+  %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !34 ; line:12 col:71
+  %4 = getelementptr inbounds %"class.RayQuery<513, 0>", %"class.RayQuery<513, 0>"* %rayQuery, i32 0, i32 0, !dbg !34 ; line:12 col:71
+  store i32 %rayQuery1, i32* %4, !dbg !34 ; line:12 col:71
+
+  %5 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !38 ; line:13 col:3
+  %6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %5), !dbg !38 ; line:13 col:3
+
+  ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef)
+
+  %7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !38 ; line:13 col:3
+
+  ; Load RayDesc fields from CB to local copy
+  ; CHECK-DAG: %[[ORIG_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 0
+  ; CHECK-DAG: %[[ORIG_LD_CB:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIG_CBP]]
+  ; CHECK-DAG: store <3 x float> %[[ORIG_LD_CB]], <3 x float>* %[[ORIG_P0:[^ ,]+]]
+  ; CHECK-DAG: %[[TMIN_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 1
+  ; CHECK-DAG: %[[TMIN_LD_CB:[^ ,]+]] = load float, float* %[[TMIN_CBP]]
+  ; CHECK-DAG: store float %[[TMIN_LD_CB]], float* %[[TMIN_P0:[^ ,]+]]
+  ; CHECK-DAG: %[[DIR_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 2
+  ; CHECK-DAG: %[[DIR_LD_CB:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIR_CBP]]
+  ; CHECK-DAG: store <3 x float> %[[DIR_LD_CB]], <3 x float>* %[[DIR_P0:[^ ,]+]]
+  ; CHECK-DAG: %[[TMAX_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 3
+  ; CHECK-DAG: %[[TMAX_LD_CB:[^ ,]+]] = load float, float* %[[TMAX_CBP]]
+  ; CHECK-DAG: store float %[[TMAX_LD_CB]], float* %[[TMAX_P0:[^ ,]+]]
+
+  ; Load RayDesc fields from local copy
+  ; CHECK-DAG: %[[ORIG:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIG_P0]]
+  ; CHECK-DAG: %[[TMIN:[^ ,]+]] = load float, float* %[[TMIN_P0]]
+  ; CHECK-DAG: %[[DIR:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIR_P0]]
+  ; CHECK-DAG: %[[TMAX:[^ ,]+]] = load float, float* %[[TMAX_P0]]
+  ; CHECK-DAG: %[[RQ:[^ ,]+]] = load i32, i32* %[[RQ_P0]]
+
+  ; Call TraceRayInline
+  ; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, <3 x float> %[[ORIG]], float %[[TMIN]], <3 x float> %[[DIR]], float %[[TMAX]])
+
+  call void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<513, 0>"* %rayQuery, %dx.types.Handle %7, i32 1, i32 2, %struct.RayDesc* %3), !dbg !38 ; line:13 col:3
+  ret void, !dbg !39 ; line:14 col:1
+}
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<513, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6, !20}
+!dx.entryPoints = !{!24}
+!dx.fnprops = !{!31}
+!dx.options = !{!32, !33}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"}
+!3 = !{i32 1, i32 5}
+!4 = !{i32 1, i32 9}
+!5 = !{!"vs", i32 6, i32 5}
+!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12, %"$Globals" undef, !18}
+!7 = !{i32 32, !8, !9, !10, !11}
+!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9}
+!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9}
+!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!12 = !{i32 4, !13, !14}
+!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!14 = !{i32 0, !15}
+!15 = !{!16, !17}
+!16 = !{i32 1, i64 513}
+!17 = !{i32 1, i64 0}
+!18 = !{i32 32, !19}
+!19 = !{i32 6, !"rayDesc", i32 3, i32 0}
+!20 = !{i32 1, void ()* @main, !21}
+!21 = !{!22}
+!22 = !{i32 1, !23, !23}
+!23 = !{}
+!24 = !{void ()* @main, !"main", null, !25, null}
+!25 = !{!26, null, !29, null}
+!26 = !{!27}
+!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28}
+!28 = !{i32 0, i32 4}
+!29 = !{!30}
+!30 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 32, null}
+!31 = !{void ()* @main, i32 1}
+!32 = !{i32 64}
+!33 = !{i32 -1}
+!34 = !DILocation(line: 12, column: 71, scope: !35)
+!35 = !DISubprogram(name: "main", scope: !36, file: !36, line: 11, type: !37, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @main)
+!36 = !DIFile(filename: "/home/texr/git/dxc/main/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl", directory: "")
+!37 = !DISubroutineType(types: !23)
+!38 = !DILocation(line: 13, column: 3, scope: !35)
+!39 = !DILocation(line: 14, column: 1, scope: !35)
diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll
new file mode 100644
index 0000000000..ee76872441
--- /dev/null
+++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll
@@ -0,0 +1,155 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+
+; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl,
+; with call to DoTrace commented out.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%struct.RaytracingAccelerationStructure = type { i32 }
+%ConstantBuffer = type opaque
+%struct.RayDesc = type { <3 x float>, float, <3 x float>, float }
+%"class.RayQuery<513, 0>" = type { i32 }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+
+@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4
+@"$Globals" = external constant %ConstantBuffer
+
+; CHECK: define void @main(float* noalias, <3 x float>, float, <3 x float>, float)
+
+; Function Attrs: nounwind
+define float @main(%struct.RayDesc* %rayDesc) #0 {
+entry:
+  %0 = alloca %struct.RayDesc
+
+  ; Copy flattened RayDesc input to main function
+  ; RayDesc fields: %1: Origin, %2: TMin, %3: Direction, %4: TMax
+  ; CHECK: store float %4, float* %[[RD3_P0:[^ ,]+]]
+  ; CHECK: store <3 x float> %3, <3 x float>* %[[RD2_P0:[^ ,]+]]
+  ; CHECK: store float %2, float* %[[RD1_P0:[^ ,]+]]
+  ; CHECK: store <3 x float> %1, <3 x float>* %[[RD0_P0:[^ ,]+]]
+
+  ; Copy RayDesc fields again
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P0]]
+  ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD0_P1:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD1_P0]]
+  ; CHECK: store float %[[LOAD]], float* %[[RD1_P1:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P0]]
+  ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD2_P1:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD3_P0]]
+  ; CHECK: store float %[[LOAD]], float* %[[RD3_P1:[^ ,]+]]
+
+  %1 = bitcast %struct.RayDesc* %0 to i8*
+  %2 = bitcast %struct.RayDesc* %rayDesc to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 32, i32 1, i1 false)
+
+  ; Capture RayQuery ptr and RTAS handle
+  ; CHECK: %[[RQ0:[^ ]+]] = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0)
+  ; CHECK: store i32 %[[RQ0]], i32* %[[RQ_P0:[^ ,]+]]
+  ; CHECK: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef)
+
+  %rayQuery = alloca %"class.RayQuery<513, 0>", align 4
+  %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !35 ; line:15 col:71
+  %3 = getelementptr inbounds %"class.RayQuery<513, 0>", %"class.RayQuery<513, 0>"* %rayQuery, i32 0, i32 0, !dbg !35 ; line:15 col:71
+  store i32 %rayQuery1, i32* %3, !dbg !35 ; line:15 col:71
+  %4 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !39 ; line:17 col:3
+  %5 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %4), !dbg !39 ; line:17 col:3
+  %6 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !39 ; line:17 col:3
+
+  ; Copy RayDesc fields again
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P1]]
+  ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD0_P2:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD1_P1]]
+  ; CHECK: store float %[[LOAD]], float* %[[RD1_P2:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P1]]
+  ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD2_P2:[^ ,]+]]
+  ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD3_P1]]
+  ; CHECK: store float %[[LOAD]], float* %[[RD3_P2:[^ ,]+]]
+
+  ; Load RayDesc fields for TraceRayInline
+  ; CHECK: %[[RD0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P2]]
+  ; CHECK: %[[RD1:[^ ,]+]] = load float, float* %[[RD1_P2]]
+  ; CHECK: %[[RD2:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P2]]
+  ; CHECK: %[[RD3:[^ ,]+]] = load float, float* %[[RD3_P2]]
+
+  ; Load RayQuery
+  ; CHECK: %[[RQ:[^ ,]+]] = load i32, i32* %[[RQ_P0]]
+
+  ; TraceRayInline call
+  ; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, <3 x float> %[[RD0]], float %[[RD1]], <3 x float> %[[RD2]], float %[[RD3]])
+
+  call void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<513, 0>"* %rayQuery, %dx.types.Handle %6, i32 1, i32 2, %struct.RayDesc* %0), !dbg !39 ; line:17 col:3
+  ret float 0.000000e+00, !dbg !40 ; line:18 col:3
+}
+
+; Function Attrs: nounwind
+declare void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<513, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1
+
+; Function Attrs: nounwind
+declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6, !18}
+!dx.entryPoints = !{!25}
+!dx.fnprops = !{!32}
+!dx.options = !{!33, !34}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"}
+!3 = !{i32 1, i32 5}
+!4 = !{i32 1, i32 9}
+!5 = !{!"vs", i32 6, i32 5}
+!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12}
+!7 = !{i32 32, !8, !9, !10, !11}
+!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9}
+!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9}
+!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9}
+!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9}
+!12 = !{i32 4, !13, !14}
+!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5}
+!14 = !{i32 0, !15}
+!15 = !{!16, !17}
+!16 = !{i32 1, i64 513}
+!17 = !{i32 1, i64 0}
+!18 = !{i32 1, float (%struct.RayDesc*)* @main, !19}
+!19 = !{!20, !23}
+!20 = !{i32 1, !21, !22}
+!21 = !{i32 4, !"OUT", i32 7, i32 9}
+!22 = !{}
+!23 = !{i32 0, !24, !22}
+!24 = !{i32 4, !"RAYDESC"}
+!25 = !{float (%struct.RayDesc*)* @main, !"main", null, !26, null}
+!26 = !{!27, null, !30, null}
+!27 = !{!28}
+!28 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !29}
+!29 = !{i32 0, i32 4}
+!30 = !{!31}
+!31 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
+!32 = !{float (%struct.RayDesc*)* @main, i32 1}
+!33 = !{i32 64}
+!34 = !{i32 -1}
+!35 = !DILocation(line: 15, column: 71, scope: !36)
+!36 = !DISubprogram(name: "main", scope: !37, file: !37, line: 14, type: !38, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: false, function: float (%struct.RayDesc*)* @main)
+!37 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl", directory: "")
+!38 = !DISubroutineType(types: !22)
+!39 = !DILocation(line: 17, column: 3, scope: !36)
+!40 = !DILocation(line: 18, column: 3, scope: !36)
diff --git a/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl b/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl
deleted file mode 100644
index b9670bdaba..0000000000
--- a/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl
+++ /dev/null
@@ -1,36 +0,0 @@
-// RUN: %dxc -Od -T lib_6_6 %s | %opt -S -dxil-annotate-with-virtual-regs | FileCheck %s
-
-
-/* To run locally run:
-%dxc -Od -T lib_6_6 %s -Fc %t.ll
-%opt %t.ll -S -dxil-annotate-with-virtual-regs | FileCheck %s
-*/
-
-RaytracingAccelerationStructure scene : register(t0);
-
-struct RayPayload
-{
-    int3 color;
-};
-
-[shader("raygeneration")]
-void ENTRY()
-{
-    RayDesc ray = {{0,0,0}, {0,0,1}, 0.05, 1000.0};
-    RayPayload pld;
-    TraceRay(scene, 0 /*rayFlags*/, 0xFF /*rayMask*/, 0 /*sbtRecordOffset*/, 1 /*sbtRecordStride*/, 0 /*missIndex*/, ray, pld);
-}
-
-// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 0, !pix-dxil-reg [[RDGEP:![0-9]+]], !pix-dxil-inst-num {{.*}}
-// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP:![0-9]+]], !pix-dxil-inst-num {{.*}}
-// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 1, !pix-dxil-reg [[RDGEP2:![0-9]+]], !pix-dxil-inst-num {{.*}}
-// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP2:![0-9]+]], !pix-dxil-inst-num {{.*}}
-// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 2, !pix-dxil-reg [[RDGEP3:![0-9]+]], !pix-dxil-inst-num {{.*}}
-// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP3:![0-9]+]], !pix-dxil-inst-num {{.*}}
-
-// CHECK-DAG: [[RDGEP]] = !{i32 0, i32 0}
-// CHECK-DAG: [[NothGEP]] = !{i32 0, i32 11}
-// CHECK-DAG: [[RDGEP2]] = !{i32 0, i32 3}
-// CHECK-DAG: [[NothGEP2]] = !{i32 0, i32 12}
-// CHECK-DAG: [[RDGEP3]] = !{i32 0, i32 4}
-// CHECK-DAG: [[NothGEP3]] = !{i32 0, i32 13}
diff --git a/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl b/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl
index 12df1ecbcf..98997a52b1 100644
--- a/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl
+++ b/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl
@@ -1,10 +1,10 @@
 // RUN: %dxc -T lib_6_5 -auto-binding-space 11 %s | FileCheck %s
 
 // CHECK: define void [[intersection1:@"\\01\?intersection1@[^\"]+"]]() #0 {
-// CHECK:   [[rayTCurrent:%[^ ]+]] = call float @dx.op.rayTCurrent.f32(i32 154)
-// CHECK:   [[GeometryIndex:%[^ ]+]] = call i32 @dx.op.geometryIndex.i32(i32 213)
-// CHECK:   icmp eq i32 [[GeometryIndex]], 0
-// CHECK:   call i1 @dx.op.reportHit.struct.MyAttributes(i32 158, float [[rayTCurrent]], i32 0, %struct.MyAttributes* nonnull {{.*}})
+// CHECK-DAG:   [[rayTCurrent:%[^ ]+]] = call float @dx.op.rayTCurrent.f32(i32 154)
+// CHECK-DAG:   [[GeometryIndex:%[^ ]+]] = call i32 @dx.op.geometryIndex.i32(i32 213)
+// CHECK-DAG:   icmp eq i32 [[GeometryIndex]], 0
+// CHECK-DAG:   call i1 @dx.op.reportHit.struct.MyAttributes(i32 158, float [[rayTCurrent]], i32 0, %struct.MyAttributes* nonnull {{.*}})
 // CHECK:   ret void
 
 struct MyAttributes {
diff --git a/tools/clang/unittests/HLSL/PixTest.cpp b/tools/clang/unittests/HLSL/PixTest.cpp
index e337d2951c..af7801c7bf 100644
--- a/tools/clang/unittests/HLSL/PixTest.cpp
+++ b/tools/clang/unittests/HLSL/PixTest.cpp
@@ -119,7 +119,6 @@ class PixTest : public ::testing::Test {
   TEST_METHOD(AccessTracking_ModificationReport_SM66)
 
   TEST_METHOD(PixStructAnnotation_Lib_DualRaygen)
-  TEST_METHOD(PixStructAnnotation_Lib_RaygenAllocaStructAlignment)
 
   TEST_METHOD(PixStructAnnotation_Simple)
   TEST_METHOD(PixStructAnnotation_CopiedStruct)
@@ -1455,100 +1454,6 @@ void Raygen1()
   }
 }
 
-TEST_F(PixTest, PixStructAnnotation_Lib_RaygenAllocaStructAlignment) {
-  if (m_ver.SkipDxilVersion(1, 5))
-    return;
-
-  const char *hlsl = R"(
-
-RaytracingAccelerationStructure Scene : register(t0, space0);
-RWTexture2D<float4> RenderTarget : register(u0);
-
-struct SceneConstantBuffer
-{
-    float4x4 projectionToWorld;
-    float4 cameraPosition;
-    float4 lightPosition;
-    float4 lightAmbientColor;
-    float4 lightDiffuseColor;
-};
-
-ConstantBuffer<SceneConstantBuffer> g_sceneCB : register(b0);
-
-struct RayPayload
-{
-    float4 color;
-};
-
-inline void GenerateCameraRay(uint2 index, out float3 origin, out float3 direction)
-{
-    float2 xy = index + 0.5f; // center in the middle of the pixel.
-    float2 screenPos = xy;// / DispatchRaysDimensions().xy * 2.0 - 1.0;
-
-    // Invert Y for DirectX-style coordinates.
-    screenPos.y = -screenPos.y;
-
-    // Unproject the pixel coordinate into a ray.
-    float4 world = /*mul(*/float4(screenPos, 0, 1)/*, g_sceneCB.projectionToWorld)*/;
-
-    //world.xyz /= world.w;
-    origin = world.xyz; //g_sceneCB.cameraPosition.xyz;
-    direction = float3(1,0,0);//normalize(world.xyz - origin);
-}
-
-void RaygenCommon()
-{
-    float3 rayDir;
-    float3 origin;
-    
-    // Generate a ray for a camera pixel corresponding to an index from the dispatched 2D grid.
-    GenerateCameraRay(DispatchRaysIndex().xy, origin, rayDir);
-
-    // Trace the ray.
-    // Set the ray's extents.
-    RayDesc ray;
-    ray.Origin = origin;
-    ray.Direction = rayDir;
-    // Set TMin to a non-zero small value to avoid aliasing issues due to floating - point errors.
-    // TMin should be kept small to prevent missing geometry at close contact areas.
-    ray.TMin = 0.001;
-    ray.TMax = 10000.0;
-    RayPayload payload = { float4(0, 0, 0, 0) };
-    TraceRay(Scene, RAY_FLAG_CULL_BACK_FACING_TRIANGLES, ~0, 0, 1, 0, ray, payload);
-
-    // Write the raytraced color to the output texture.
-   // RenderTarget[DispatchRaysIndex().xy] = payload.color;
-}
-
-[shader("raygeneration")]
-void Raygen()
-{
-    RaygenCommon();
-}
-)";
-
-  auto Testables = TestStructAnnotationCase(hlsl, L"-Od", true, L"lib_6_6");
-
-  // Built-in type "RayDesc" has this structure: struct { float3 Origin; float
-  // TMin; float3 Direction; float TMax; } This is 8 floats, with members at
-  // offsets 0,3,4,7 respectively.
-
-  auto FindAtLeastOneOf = [=](char const *name, uint32_t index) {
-    VERIFY_IS_TRUE(std::find_if(Testables.AllocaWrites.begin(),
-                                Testables.AllocaWrites.end(),
-                                [&name, &index](AllocaWrite const &aw) {
-                                  return 0 == strcmp(aw.memberName.c_str(),
-                                                     name) &&
-                                         aw.index == index;
-                                }) != Testables.AllocaWrites.end());
-  };
-
-  FindAtLeastOneOf("Origin.x", 0);
-  FindAtLeastOneOf("TMin", 3);
-  FindAtLeastOneOf("Direction.x", 4);
-  FindAtLeastOneOf("TMax", 7);
-}
-
 TEST_F(PixTest, PixStructAnnotation_Simple) {
   if (m_ver.SkipDxilVersion(1, 5))
     return;
@@ -3441,7 +3346,6 @@ void RaygenInternalName()
   // check that there are alloca writes that cover all of them. RayPayload
   // has four elements, and RayDesc has eight.
   std::array<bool, 4> RayPayloadElementCoverage;
-  std::array<bool, 8> RayDescElementCoverage;
 
   for (auto const &write : metaDataKeyToValue.allocaWrites) {
     // the whole point of the changes with this test is to separate vector
@@ -3452,14 +3356,10 @@ void RaygenInternalName()
     if (findAlloca != metaDataKeyToValue.allocaDefinitions.end()) {
       if (findAlloca->second.count == 4) {
         RayPayloadElementCoverage[write.second.offset] = true;
-      } else if (findAlloca->second.count == 8) {
-        RayDescElementCoverage[write.second.offset] = true;
       }
     }
   }
   // Check that coverage for every element was emitted:
   for (auto const &b : RayPayloadElementCoverage)
     VERIFY_IS_TRUE(b);
-  for (auto const &b : RayDescElementCoverage)
-    VERIFY_IS_TRUE(b);
 }

From f9c2d5de38cd37b42de07fe5b986bb424df38be5 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Fri, 16 May 2025 18:03:51 +0200
Subject: [PATCH 43/93] [SER] Diagnose HitObject in unsupported declaration
 contexts (#7376)

- Generalize long vector diagnostics code to HitObjects.
- Diagnose unsupported use of HitObject in globals, entry params/returns
and various other shader-kind-specific contexts.
- Create HitObject variants from the invalid-longvec-decls*.hlsl tests
to make sure all cases are covered.

Specification:
https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md

Closes #7234 [SER] Diagnose and validate illegal use of HitObject in
unsupported contexts (discussed offline)
---
 tools/clang/include/clang/AST/DeclCXX.h       |  11 -
 .../clang/Basic/DiagnosticSemaKinds.td        |  13 +-
 tools/clang/include/clang/Sema/SemaHLSL.h     |  32 ++
 tools/clang/lib/AST/DeclCXX.cpp               |  14 +-
 tools/clang/lib/AST/HlslTypes.cpp             |   7 +
 tools/clang/lib/Sema/SemaDXR.cpp              |  13 +-
 tools/clang/lib/Sema/SemaHLSL.cpp             | 241 +++++++-----
 tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp   |  24 +-
 .../lib/Sema/SemaTemplateInstantiate.cpp      |  12 -
 .../HitObject/hitobject-in-buffer.hlsl        |   2 +-
 .../hitobject_traceinvoke_payload_udt.hlsl    |  17 +-
 .../types/invalid-hitobject-decls-hs.hlsl     |  32 ++
 .../types/invalid-hitobject-decls-struct.hlsl | 344 ++++++++++++++++++
 .../invalid-hitobject-decls-templated.hlsl    | 340 +++++++++++++++++
 .../workgraph/invalid_node_record_type.hlsl   |  14 +-
 .../clang/test/SemaHLSL/template-checks.hlsl  |   6 +-
 .../test/SemaHLSL/template-udt-load.hlsl      |   4 +-
 17 files changed, 964 insertions(+), 162 deletions(-)
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl

diff --git a/tools/clang/include/clang/AST/DeclCXX.h b/tools/clang/include/clang/AST/DeclCXX.h
index 36e0f99c82..3b07576545 100644
--- a/tools/clang/include/clang/AST/DeclCXX.h
+++ b/tools/clang/include/clang/AST/DeclCXX.h
@@ -465,10 +465,6 @@ class CXXRecordDecl : public RecordDecl {
     /// \brief Whether we are currently parsing base specifiers.
     bool IsParsingBaseSpecifiers : 1;
 
-    /// \brief Whether this class contains at least one member or base
-    ///  class containing an HLSL vector longer than 4 elements.
-    bool HasHLSLLongVector : 1;
-
     /// \brief The number of base class specifiers in Bases.
     unsigned NumBases;
 
@@ -1022,13 +1018,6 @@ class CXXRecordDecl : public RecordDecl {
     return data().NeedOverloadResolutionForDestructor;
   }
 
-  // HLSL Change add HLSL Long vector bit.
-  /// \brief Determine whether this class contains an HLSL long vector
-  /// of over 4 elements.
-  bool hasHLSLLongVector() { return data().HasHLSLLongVector; }
-  /// \brief Set that this class contains an HLSL long vector of over 4 elements
-  bool setHasHLSLLongVector() { return data().HasHLSLLongVector = true; }
-
   /// \brief Determine whether this class describes a lambda function object.
   bool isLambda() const {
     // An update record can't turn a non-lambda into a lambda.
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index ae7e777180..003aa50795 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7558,8 +7558,6 @@ def err_hlsl_missing_type_specifier : Error< // Patterened after err_missing_typ
   "HLSL requires a type specifier for all declarations">;
 def err_hlsl_multiple_concrete_bases : Error<
   "multiple concrete base types specified">;
-def err_hlsl_objectintemplateargument : Error<
-  "%0 is an object and cannot be used as a type parameter">;
 def err_hlsl_packoffset_requires_cbuffer : Error<
   "packoffset is only allowed in a constant buffer">;
 def warn_hlsl_packoffset_mix : Warning<
@@ -7886,6 +7884,15 @@ def err_hlsl_unsupported_long_vector
     "entry function parameters|entry function return type|"
     "patch constant function parameters|patch constant function return type|"
     "payload parameters|attributes}0 are not supported">;
+// First %select options must match err_hlsl_unsupported_long_vector (same index used)
+def err_hlsl_unsupported_object_context
+    : Error<"object %0 is not allowed in "
+    "%select{ConstantBuffers or TextureBuffers|"
+    "tessellation patches|geometry streams|node records|"
+    "cbuffers or tbuffers|user-defined struct parameter|"
+    "entry function parameters|entry function return type|"
+    "patch constant function parameters|patch constant function return type|"
+    "payload parameters|attributes|builtin template parameters|structured buffers|global variables|groupshared variables}1">;
 def err_hlsl_logical_binop_scalar : Error<
    "operands for short-circuiting logical binary operator must be scalar, for non-scalar types use '%select{and|or}0'">;
 def err_hlsl_ternary_scalar : Error<
@@ -7970,8 +7977,6 @@ def err_hlsl_too_many_node_inputs : Error<
    "Node shader '%0' may not have more than one input record">;
 def err_hlsl_node_record_type : Error<
    "%0 is not valid as a node record type - struct/class required">;
-def err_hlsl_node_record_object : Error<
-   "object %0 may not appear in a node record">;
 def err_hlsl_array_disallowed : Error<
    "%select{entry parameter|declaration}1 of type %0 may not be an array">;
 def err_hlsl_inputpatch_size: Error<
diff --git a/tools/clang/include/clang/Sema/SemaHLSL.h b/tools/clang/include/clang/Sema/SemaHLSL.h
index 59d99ab4c5..80ce8ddd7d 100644
--- a/tools/clang/include/clang/Sema/SemaHLSL.h
+++ b/tools/clang/include/clang/Sema/SemaHLSL.h
@@ -59,6 +59,38 @@ bool DiagnoseNodeStructArgument(clang::Sema *self,
                                 clang::QualType ArgTy, bool &Empty,
                                 const clang::FieldDecl *FD = nullptr);
 
+// Must match err_hlsl_unsupported_object_context in DiagnosticSemaKinds.td
+enum class TypeDiagContext {
+  // Indicates that the type context is valid and no diagnostics should be
+  // emitted for this type category.
+  Valid = -1,
+  // Supported indices for both `err_hlsl_unsupported_object_context` and
+  // `err_hlsl_unsupported_long_vector`
+  ConstantBuffersOrTextureBuffers = 0,
+  TessellationPatches = 1,
+  GeometryStreams = 2,
+  NodeRecords = 3,
+  CBuffersOrTBuffers = 4,
+  UserDefinedStructParameter = 5,
+  EntryFunctionParameters = 6,
+  EntryFunctionReturnType = 7,
+  PatchConstantFunctionParameters = 8,
+  PatchConstantFunctionReturnType = 9,
+  PayloadParameters = 10,
+  Attributes = 11,
+  TypeParameter = 12,
+  LongVecDiagMaxSelectIndex = TypeParameter,
+  // Below only supported for `err_hlsl_unsupported_object_context`
+  StructuredBuffers = 13,
+  GlobalVariables = 14,
+  GroupShared = 15,
+  DiagMaxSelectIndex = 15,
+};
+bool DiagnoseTypeElements(clang::Sema &S, clang::SourceLocation Loc,
+                          clang::QualType Ty, TypeDiagContext ObjDiagContext,
+                          TypeDiagContext LongVecDiagContext,
+                          const clang::FieldDecl *FD = nullptr);
+
 void DiagnoseControlFlowConditionForHLSL(clang::Sema *self,
                                          clang::Expr *condExpr,
                                          llvm::StringRef StmtName);
diff --git a/tools/clang/lib/AST/DeclCXX.cpp b/tools/clang/lib/AST/DeclCXX.cpp
index baed44667f..8023a0a588 100644
--- a/tools/clang/lib/AST/DeclCXX.cpp
+++ b/tools/clang/lib/AST/DeclCXX.cpp
@@ -72,8 +72,8 @@ CXXRecordDecl::DefinitionData::DefinitionData(CXXRecordDecl *D)
       ImplicitCopyAssignmentHasConstParam(true),
       HasDeclaredCopyConstructorWithConstParam(false),
       HasDeclaredCopyAssignmentWithConstParam(false), IsLambda(false),
-      IsParsingBaseSpecifiers(false), HasHLSLLongVector(false), NumBases(0),
-      NumVBases(0), Bases(), VBases(), Definition(D), FirstFriend() {}
+      IsParsingBaseSpecifiers(false), NumBases(0), NumVBases(0), Bases(),
+      VBases(), Definition(D), FirstFriend() {}
 // HLSL Change End - Add HasLongVector and clang-format
 
 CXXBaseSpecifier *CXXRecordDecl::DefinitionData::getBasesSlowCase() const {
@@ -203,11 +203,6 @@ CXXRecordDecl::setBases(CXXBaseSpecifier const * const *Bases,
     if (!BaseClassDecl->isStandardLayout())
       data().IsStandardLayout = false;
 
-    // HLSL Change Begin - Propagate presence of long vector to child classes.
-    if (BaseClassDecl->hasHLSLLongVector())
-      data().HasHLSLLongVector = true;
-    // HLSL Change End
-
     // Record if this base is the first non-literal field or base.
     if (!hasNonLiteralTypeFieldsOrBases() && !BaseType->isLiteralType(C))
       data().HasNonLiteralTypeFieldsOrBases = true;
@@ -389,11 +384,6 @@ void CXXRecordDecl::addedClassSubobject(CXXRecordDecl *Subobj) {
     data().NeedOverloadResolutionForMoveConstructor = true;
     data().NeedOverloadResolutionForDestructor = true;
   }
-
-  // HLSL Change Begin - Propagate presence of long vector to child classes.
-  if (Subobj->hasHLSLLongVector())
-    data().HasHLSLLongVector = true;
-  // HLSL Change End
 }
 
 /// Callback function for CXXRecordDecl::forallBases that acknowledges
diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index e081362ebf..05386ddaa5 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -120,6 +120,13 @@ bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) {
       if (!IsHLSLNumericOrAggregateOfNumericType(Member->getType()))
         return false;
     }
+    if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+      // Walk up the inheritance chain and check base class fields
+      for (const auto &Base : CXXRD->bases()) {
+        if (!IsHLSLCopyableAnnotatableRecord(Base.getType()))
+          return false;
+      }
+    }
     return true;
   }
   return false;
diff --git a/tools/clang/lib/Sema/SemaDXR.cpp b/tools/clang/lib/Sema/SemaDXR.cpp
index f0102f9e3f..04e1582513 100644
--- a/tools/clang/lib/Sema/SemaDXR.cpp
+++ b/tools/clang/lib/Sema/SemaDXR.cpp
@@ -827,19 +827,16 @@ void DiagnoseBuiltinCallWithPayload(Sema &S, const VarDecl *Payload,
   }
 
   // Verify that the payload type is legal
-  if (!hlsl::IsHLSLCopyableAnnotatableRecord(Payload->getType())) {
+  if (!hlsl::IsHLSLCopyableAnnotatableRecord(Payload->getType()))
     S.Diag(Payload->getLocation(), diag::err_payload_attrs_must_be_udt)
         << /*payload|attributes|callable*/ 0 << /*parameter %2|type*/ 0
         << Payload;
-    return;
-  }
 
-  if (ContainsLongVector(Payload->getType())) {
-    const unsigned PayloadParametersIdx = 10;
-    S.Diag(Payload->getLocation(), diag::err_hlsl_unsupported_long_vector)
-        << PayloadParametersIdx;
+  // This will produce more details, but also catch disallowed long vectors
+  const TypeDiagContext DiagContext = TypeDiagContext::PayloadParameters;
+  if (DiagnoseTypeElements(S, Payload->getLocation(), Payload->getType(),
+                           DiagContext, DiagContext))
     return;
-  }
 
   CollectNonAccessableFields(PayloadType, CallerStage, {}, {},
                              NonWriteableFields, NonReadableFields);
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index b15068638d..e5424ecdde 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -46,6 +46,7 @@
 #include "clang/Sema/TemplateDeduction.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -5394,7 +5395,8 @@ class HLSLExternalSource : public ExternalSemaSource {
         objectKind = ClassifyRecordType(recordType);
         switch (objectKind) {
         case AR_TOBJ_OBJECT:
-          m_sema->Diag(argLoc, diag::err_hlsl_objectintemplateargument) << type;
+          m_sema->Diag(argLoc, diag::err_hlsl_unsupported_object_context)
+              << type << static_cast<unsigned>(TypeDiagContext::TypeParameter);
           return false;
         case AR_TOBJ_COMPOUND: {
           const RecordDecl *recordDecl = recordType->getDecl();
@@ -5533,14 +5535,27 @@ class HLSLExternalSource : public ExternalSemaSource {
         m_sema->RequireCompleteType(argSrcLoc, argType,
                                     diag::err_typecheck_decl_incomplete_type);
 
-        if (ContainsLongVector(argType)) {
-          const unsigned ConstantBuffersOrTextureBuffersIdx = 0;
-          m_sema->Diag(argSrcLoc, diag::err_hlsl_unsupported_long_vector)
-              << ConstantBuffersOrTextureBuffersIdx;
+        TypeDiagContext DiagContext =
+            TypeDiagContext::ConstantBuffersOrTextureBuffers;
+        if (DiagnoseTypeElements(*m_sema, argSrcLoc, argType, DiagContext,
+                                 DiagContext))
           return true;
-        }
       }
       return false;
+    } else if (ResAttr && DXIL::IsStructuredBuffer(ResAttr->getResKind())) {
+      if (TemplateArgList.size() == 1) {
+        const TemplateArgumentLoc &ArgLoc = TemplateArgList[0];
+        const TemplateArgument &Arg = ArgLoc.getArgument();
+        if (Arg.getKind() == TemplateArgument::ArgKind::Type) {
+          QualType ArgType = Arg.getAsType();
+          SourceLocation ArgSrcLoc = ArgLoc.getLocation();
+          if (DiagnoseTypeElements(
+                  *m_sema, ArgSrcLoc, ArgType,
+                  TypeDiagContext::StructuredBuffers /*ObjDiagContext*/,
+                  TypeDiagContext::Valid /*LongVecDiagContext*/))
+            return true;
+        }
+      }
 
     } else if (Template->getTemplatedDecl()->hasAttr<HLSLNodeObjectAttr>()) {
 
@@ -5641,13 +5656,10 @@ class HLSLExternalSource : public ExternalSemaSource {
       CXXRecordDecl *Decl = arg.getAsType()->getAsCXXRecordDecl();
       if (Decl && !Decl->isCompleteDefinition())
         return true;
-      if (ContainsLongVector(arg.getAsType())) {
-        const unsigned TessellationPatchesIDx = 1;
-        m_sema->Diag(argLoc.getLocation(),
-                     diag::err_hlsl_unsupported_long_vector)
-            << TessellationPatchesIDx;
+      const TypeDiagContext DiagContext = TypeDiagContext::TessellationPatches;
+      if (DiagnoseTypeElements(*m_sema, argLoc.getLocation(), arg.getAsType(),
+                               DiagContext, DiagContext))
         return true;
-      }
     } else if (Template->getTemplatedDecl()->hasAttr<HLSLStreamOutputAttr>()) {
       DXASSERT(TemplateArgList.size() > 0,
                "Geometry streams should have at least one template args");
@@ -5660,13 +5672,10 @@ class HLSLExternalSource : public ExternalSemaSource {
       CXXRecordDecl *Decl = arg.getAsType()->getAsCXXRecordDecl();
       if (Decl && !Decl->isCompleteDefinition())
         return true;
-      if (ContainsLongVector(arg.getAsType())) {
-        const unsigned GeometryStreamsIdx = 2;
-        m_sema->Diag(argLoc.getLocation(),
-                     diag::err_hlsl_unsupported_long_vector)
-            << GeometryStreamsIdx;
+      const TypeDiagContext DiagContext = TypeDiagContext::GeometryStreams;
+      if (DiagnoseTypeElements(*m_sema, argLoc.getLocation(), arg.getAsType(),
+                               DiagContext, DiagContext))
         return true;
-      }
     }
 
     bool isMatrix = Template->getCanonicalDecl() ==
@@ -10784,11 +10793,9 @@ bool DiagnoseIntersectionAttributes(Sema &S, SourceLocation Loc, QualType Ty) {
     return false;
   }
 
-  if (ContainsLongVector(Ty)) {
-    const unsigned AttributesIdx = 11;
-    S.Diag(Loc, diag::err_hlsl_unsupported_long_vector) << AttributesIdx;
+  const TypeDiagContext DiagContext = TypeDiagContext::Attributes;
+  if (DiagnoseTypeElements(S, Loc, Ty, DiagContext, DiagContext))
     return false;
-  }
   return true;
 }
 
@@ -10940,6 +10947,10 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
         if (!IsLegalTemplate) {
           getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_numeric)
               << intrinsicName;
+          DiagnoseTypeElements(
+              *getSema(), Loc, functionTemplateTypeArg,
+              TypeDiagContext::TypeParameter /*ObjDiagContext*/,
+              TypeDiagContext::Valid /*LongVecDiagContext*/);
           return Sema::TemplateDeductionResult::TDK_Invalid;
         }
       }
@@ -12128,34 +12139,73 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
 
 /////////////////////////////////////////////////////////////////////////////
 
-bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc,
-                                      QualType ArgTy, bool &Empty,
-                                      const FieldDecl *FD) {
-  DXASSERT_NOMSG(!ArgTy.isNull());
+static bool AllowObjectInContext(QualType Ty, TypeDiagContext DiagContext) {
+  // Disallow all objects in template type parameters (former
+  // err_hlsl_objectintemplateargument)
+  if (DiagContext == TypeDiagContext::TypeParameter)
+    return false;
+  // Disallow all objects in node records (former
+  // err_hlsl_node_record_object)
+  if (DiagContext == TypeDiagContext::NodeRecords)
+    return false;
+  // TODO: Extend this list for other object types (only HitObject for now).
+  if (IsHLSLHitObjectType(Ty))
+    return false;
+  return true;
+}
 
-  HLSLExternalSource *source = HLSLExternalSource::FromSema(self);
-  ArTypeObjectKind shapeKind = source->GetTypeObjectKind(ArgTy);
-  switch (shapeKind) {
+// Check `Ty` (arrays are unwrapped to their element type) for objects that
+// are not allowed in `ObjDiagContext` and for long vectors that are not
+// allowed in `LongVecDiagContext`; TypeDiagContext::Valid skips that check.
+// Errors are emitted via `S` at `Loc`; object errors add a note at `FD` if
+// set and return true. Note that a diagnosed long vector returns false.
+// `Empty` is set false for non-empty types; `CheckedDecls` avoids rechecks.
+static bool
+DiagnoseElementTypes(Sema &S, SourceLocation Loc, QualType Ty, bool &Empty,
+                     TypeDiagContext ObjDiagContext,
+                     TypeDiagContext LongVecDiagContext,
+                     llvm::SmallPtrSet<const RecordDecl *, 8> &CheckedDecls,
+                     const clang::FieldDecl *FD) {
+  if (Ty.isNull() || Ty->isDependentType())
+    return false;
+
+  const bool CheckLongVec = LongVecDiagContext != TypeDiagContext::Valid;
+  const bool CheckObjects = ObjDiagContext != TypeDiagContext::Valid;
+
+  while (const ArrayType *Arr = Ty->getAsArrayTypeUnsafe())
+    Ty = Arr->getElementType();
+
+  const int ObjDiagContextIdx = static_cast<int>(ObjDiagContext);
+  const int LongVecDiagContextIdx = static_cast<int>(LongVecDiagContext);
+  DXASSERT_NOMSG(
+      LongVecDiagContext == TypeDiagContext::Valid ||
+      (0 <= LongVecDiagContextIdx &&
+       LongVecDiagContextIdx <=
+           static_cast<int>(TypeDiagContext::LongVecDiagMaxSelectIndex)));
+
+  HLSLExternalSource *Source = HLSLExternalSource::FromSema(&S);
+  ArTypeObjectKind ShapeKind = Source->GetTypeObjectKind(Ty);
+  switch (ShapeKind) {
   case AR_TOBJ_VECTOR:
-    if (GetHLSLVecSize(ArgTy) > DXIL::kDefaultMaxVectorLength) {
-      const unsigned NodeRecordsIdx = 3;
-      self->Diag(ArgLoc.getLocation(), diag::err_hlsl_unsupported_long_vector)
-          << NodeRecordsIdx;
+    if (CheckLongVec && GetHLSLVecSize(Ty) > DXIL::kDefaultMaxVectorLength) {
+      S.Diag(Loc, diag::err_hlsl_unsupported_long_vector)
+          << LongVecDiagContextIdx;
       Empty = false;
       return false;
     }
     LLVM_FALLTHROUGH;
-  case AR_TOBJ_ARRAY:
   case AR_TOBJ_BASIC:
   case AR_TOBJ_MATRIX:
     Empty = false;
     return false;
   case AR_TOBJ_OBJECT:
     Empty = false;
-    self->Diag(ArgLoc.getLocation(), diag::err_hlsl_node_record_object)
-        << ArgTy << ArgLoc.getSourceRange();
+    if (!CheckObjects || AllowObjectInContext(Ty, ObjDiagContext))
+      return false;
+    S.Diag(Loc, diag::err_hlsl_unsupported_object_context)
+        << Ty << ObjDiagContextIdx;
     if (FD)
-      self->Diag(FD->getLocation(), diag::note_field_declared_here)
+      S.Diag(FD->getLocation(), diag::note_field_declared_here)
           << FD->getType() << FD->getSourceRange();
     return true;
   case AR_TOBJ_DEPENDENT:
@@ -12164,25 +12214,55 @@ bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc,
     return true;
   case AR_TOBJ_COMPOUND: {
     bool ErrorFound = false;
-    const RecordDecl *RD = ArgTy->getAs<RecordType>()->getDecl();
+    const RecordDecl *RD = Ty->getAs<RecordType>()->getDecl();
+    // Never recurse redundantly into related subtypes that have already been
+    // checked.
+    if (!CheckedDecls.insert(RD).second)
+      return false;
+
     // Check the fields of the RecordDecl
-    for (auto *FD : RD->fields())
+    for (auto *ElemFD : RD->fields()) {
       ErrorFound |=
-          DiagnoseNodeStructArgument(self, ArgLoc, FD->getType(), Empty, FD);
-    if (RD->isCompleteDefinition())
-      if (auto *Child = dyn_cast<CXXRecordDecl>(RD))
-        // Walk up the inheritance chain and check base class fields
-        for (auto &B : Child->bases())
-          ErrorFound |=
-              DiagnoseNodeStructArgument(self, ArgLoc, B.getType(), Empty);
+          DiagnoseElementTypes(S, Loc, ElemFD->getType(), Empty, ObjDiagContext,
+                               LongVecDiagContext, CheckedDecls, ElemFD);
+    }
+    if (!RD->isCompleteDefinition())
+      return ErrorFound;
+
+    if (auto *Child = dyn_cast<CXXRecordDecl>(RD))
+      // Walk up the inheritance chain and check base class fields
+      for (auto &B : Child->bases())
+        ErrorFound |=
+            DiagnoseElementTypes(S, Loc, B.getType(), Empty, ObjDiagContext,
+                                 LongVecDiagContext, CheckedDecls, nullptr);
     return ErrorFound;
   }
   default:
-    DXASSERT(false, "unreachable");
+    // Not a recursive type, no element types to check here
+    Empty = false;
     return false;
   }
 }
 
+bool hlsl::DiagnoseTypeElements(Sema &S, SourceLocation Loc, QualType Ty,
+                                TypeDiagContext ObjDiagContext,
+                                TypeDiagContext LongVecDiagContext,
+                                const clang::FieldDecl *FD) {
+  bool Empty = false; // required by the helper; the result is ignored here
+  llvm::SmallPtrSet<const RecordDecl *, 8> CheckedDecls;
+  return DiagnoseElementTypes(S, Loc, Ty, Empty, ObjDiagContext,
+                              LongVecDiagContext, CheckedDecls, FD);
+}
+
+bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc,
+                                      QualType ArgTy, bool &Empty,
+                                      const FieldDecl *FD) {
+  llvm::SmallPtrSet<const RecordDecl *, 8> CheckedDecls; // visited records
+  return DiagnoseElementTypes(*self, ArgLoc.getLocation(), ArgTy, Empty,
+                              TypeDiagContext::NodeRecords,
+                              TypeDiagContext::NodeRecords, CheckedDecls, FD);
+}
+
 // This function diagnoses whether or not all entry-point attributes
 // should exist on this shader stage
 void DiagnoseEntryAttrAllowedOnStage(clang::Sema *self,
@@ -12610,21 +12690,6 @@ bool hlsl::ShouldSkipNRVO(clang::Sema &sema, clang::QualType returnType,
   return false;
 }
 
-bool hlsl::ContainsLongVector(QualType QT) {
-  if (QT.isNull() || QT->isDependentType())
-    return false;
-
-  while (const ArrayType *Arr = QT->getAsArrayTypeUnsafe())
-    QT = Arr->getElementType();
-
-  if (CXXRecordDecl *Decl = QT->getAsCXXRecordDecl()) {
-    if (!Decl->isCompleteDefinition())
-      return false;
-    return Decl->hasHLSLLongVector();
-  }
-  return false;
-}
-
 bool hlsl::IsConversionToLessOrEqualElements(
     clang::Sema *self, const clang::ExprResult &sourceExpr,
     const clang::QualType &targetType, bool explicitConversion) {
@@ -15295,8 +15360,8 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC, Expr *BitWidth,
     result = false;
   }
 
-  // Disallow long vecs from $Global cbuffers.
-  if (isGlobal && !isStatic && !isGroupShared && !IS_BASIC_OBJECT(basicKind)) {
+  // Disallow intangible HLSL objects in the global scope.
+  if (isGlobal) {
     // Suppress actual emitting of errors for incompletable types here
     // They are redundant to those produced in ActOnUninitializedDecl.
     struct SilentDiagnoser : public TypeDiagnoser {
@@ -15304,12 +15369,22 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC, Expr *BitWidth,
       virtual void diagnose(Sema &S, SourceLocation Loc, QualType T) {}
     } SD;
     RequireCompleteType(D.getLocStart(), qt, SD);
-    if (ContainsLongVector(qt)) {
-      unsigned CbuffersOrTbuffersIdx = 4;
-      Diag(D.getLocStart(), diag::err_hlsl_unsupported_long_vector)
-          << CbuffersOrTbuffersIdx;
+
+    // Disallow objects in the global context
+    TypeDiagContext ObjDiagContext = TypeDiagContext::CBuffersOrTBuffers;
+    if (isGroupShared)
+      ObjDiagContext = TypeDiagContext::GroupShared;
+    else if (isStatic)
+      ObjDiagContext = TypeDiagContext::GlobalVariables;
+
+    TypeDiagContext LongVecDiagContext = TypeDiagContext::Valid;
+
+    // Disallow long vecs from $Global cbuffers.
+    if (!isStatic && !isGroupShared && !IS_BASIC_OBJECT(basicKind))
+      LongVecDiagContext = TypeDiagContext::CBuffersOrTBuffers;
+    if (DiagnoseTypeElements(*this, D.getLocStart(), qt, ObjDiagContext,
+                             LongVecDiagContext))
       result = false;
-    }
   }
 
   // SPIRV change starts
@@ -16214,13 +16289,10 @@ static bool isRelatedDeclMarkedNointerpolation(Expr *E) {
 
 // Verify that user-defined intrinsic struct args contain no long vectors
 static bool CheckUDTIntrinsicArg(Sema *S, Expr *Arg) {
-  if (ContainsLongVector(Arg->getType())) {
-    const unsigned UserDefinedStructParameterIdx = 5;
-    S->Diag(Arg->getExprLoc(), diag::err_hlsl_unsupported_long_vector)
-        << UserDefinedStructParameterIdx;
-    return true;
-  }
-  return false;
+  const TypeDiagContext DiagContext =
+      TypeDiagContext::UserDefinedStructParameter;
+  return DiagnoseTypeElements(*S, Arg->getExprLoc(), Arg->getType(),
+                              DiagContext, DiagContext);
 }
 
 static bool CheckIntrinsicGetAttributeAtVertex(Sema *S, FunctionDecl *FDecl,
@@ -16957,18 +17029,15 @@ void DiagnoseEntry(Sema &S, FunctionDecl *FD) {
   // Would be nice to check for resources here as they crash the compiler now.
   // See issue #7186.
   for (const auto *param : FD->params()) {
-    if (ContainsLongVector(param->getType())) {
-      const unsigned EntryFunctionParametersIdx = 6;
-      S.Diag(param->getLocation(), diag::err_hlsl_unsupported_long_vector)
-          << EntryFunctionParametersIdx;
-    }
+    const TypeDiagContext DiagContext =
+        TypeDiagContext::EntryFunctionParameters;
+    hlsl::DiagnoseTypeElements(S, param->getLocation(), param->getType(),
+                               DiagContext, DiagContext);
   }
 
-  if (ContainsLongVector(FD->getReturnType())) {
-    const unsigned EntryFunctionReturnIdx = 7;
-    S.Diag(FD->getLocation(), diag::err_hlsl_unsupported_long_vector)
-        << EntryFunctionReturnIdx;
-  }
+  const TypeDiagContext DiagContext = TypeDiagContext::EntryFunctionReturnType;
+  DiagnoseTypeElements(S, FD->getLocation(), FD->getReturnType(), DiagContext,
+                       DiagContext);
 
   DXIL::ShaderKind Stage =
       ShaderModel::KindFromFullName(shaderAttr->getStage());
diff --git a/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp b/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp
index abca7cbf86..a3ca955802 100644
--- a/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp
+++ b/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp
@@ -709,20 +709,18 @@ void hlsl::DiagnoseTranslationUnit(clang::Sema *self) {
               << hullPatchCount.value();
         }
       }
-      for (const auto *param : pPatchFnDecl->params())
-        if (ContainsLongVector(param->getType())) {
-          const unsigned PatchConstantFunctionParametersIdx = 8;
-          self->Diag(param->getLocation(),
-                     diag::err_hlsl_unsupported_long_vector)
-              << PatchConstantFunctionParametersIdx;
-        }
-
-      if (ContainsLongVector(pPatchFnDecl->getReturnType())) {
-        const unsigned PatchConstantFunctionReturnIdx = 9;
-        self->Diag(pPatchFnDecl->getLocation(),
-                   diag::err_hlsl_unsupported_long_vector)
-            << PatchConstantFunctionReturnIdx;
+      for (const auto *param : pPatchFnDecl->params()) {
+        const TypeDiagContext ParamDiagContext =
+            TypeDiagContext::PatchConstantFunctionParameters;
+        DiagnoseTypeElements(*self, param->getLocation(), param->getType(),
+                             ParamDiagContext, ParamDiagContext);
       }
+
+      const TypeDiagContext ReturnDiagContext =
+          TypeDiagContext::PatchConstantFunctionReturnType;
+      DiagnoseTypeElements(*self, pPatchFnDecl->getLocation(),
+                           pPatchFnDecl->getReturnType(), ReturnDiagContext,
+                           ReturnDiagContext);
     }
     DXIL::ShaderKind EntrySK = shaderModel->GetKind();
     DXIL::NodeLaunchType NodeLaunchTy = DXIL::NodeLaunchType::Invalid;
diff --git a/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp b/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 1eacedbb0b..a6ae05faa5 100644
--- a/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2139,18 +2139,6 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation,
               SourceLocation(), SourceLocation(), nullptr);
   CheckCompletedCXXClass(Instantiation);
 
-  // HLSL Change Begin - set longvec bit for vectors of over 4 elements
-  ClassTemplateSpecializationDecl *Spec =
-      dyn_cast<ClassTemplateSpecializationDecl>(Instantiation);
-  if (Spec && Spec->hasAttr<HLSLVectorAttr>()) {
-    const TemplateArgumentList &argList = Spec->getTemplateArgs();
-    const TemplateArgument &arg1 = argList[1];
-    llvm::APSInt vecSize = arg1.getAsIntegral();
-    if (vecSize.getLimitedValue() > hlsl::DXIL::kDefaultMaxVectorLength)
-      Instantiation->setHasHLSLLongVector();
-  }
-  // HLSL Change End - set longvec bit for vectors of over 4 elements
-
   // Default arguments are parsed, if not instantiated. We can go instantiate
   // default arg exprs for default constructors if necessary now.
   ActOnFinishCXXMemberDefaultArgs(Instantiation);
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl
index baa3a07a5b..b091bd2ac5 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl
@@ -1,4 +1,4 @@
 // RUN: %dxc -T lib_6_9 %s -verify
 
-// expected-error@+1{{'dx::HitObject' is an object and cannot be used as a type parameter}}
+// expected-error@+1{{object 'dx::HitObject' is not allowed in structured buffers}}
 RWStructuredBuffer<dx::HitObject> InvalidBuffer;
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
index e89e33a78f..ee4ff8c020 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
@@ -4,19 +4,28 @@ struct
 [raypayload]
 Payload
 {
-    int a : read(caller, closesthit, miss) : write(caller, closesthit, miss);
+    int a : read(closesthit, miss) : write(anyhit);
     dx::HitObject hit;
 };
 
-struct Attribs
+struct
+[raypayload]
+PayloadLV
 {
-    float2 barys;
+    int a : read(closesthit, miss) : write(anyhit);
+    vector<float, 5> b : read(closesthit, miss) : write(anyhit);
 };
 
 [shader("raygeneration")]
 void RayGen()
 {
-  // expected-error@+1{{payload parameter 'payload_in_rg' must be a user-defined type composed of only numeric types}}
+  // expected-error@+3{{payload parameter 'payload_in_rg' must be a user-defined type composed of only numeric types}}
+  // expected-error@+2{{object 'dx::HitObject' is not allowed in payload parameters}}
+  // expected-note@8{{'dx::HitObject' field declared here}}
   Payload payload_in_rg;
   dx::HitObject::Invoke( dx::HitObject(), payload_in_rg );
+
+  // expected-error@+1{{vectors of over 4 elements in payload parameters are not supported}}
+  PayloadLV payload_with_lv;
+  dx::HitObject::Invoke( dx::HitObject(), payload_with_lv );
 }
\ No newline at end of file
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl
new file mode 100644
index 0000000000..3a4457bd5f
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl
@@ -0,0 +1,32 @@
+// RUN: %dxc -T hs_6_9 -verify %s
+
+struct HsConstantData {
+  float Edges[3] : SV_TessFactor;
+  dx::HitObject hit;
+};
+
+struct LongVec {
+  float4 f;
+  dx::HitObject hit;
+};
+
+HsConstantData
+PatchConstantFunction(
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function return type}}
+    // expected-note@5{{'dx::HitObject' field declared here}}
+	  dx::HitObject hit : V,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function parameters}}
+	  LongVec lv : L)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function parameters}}
+    // expected-note@10{{'dx::HitObject' field declared here}}
+{
+  HsConstantData empty;
+  return empty;
+}
+
+[domain("tri")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(32)]
+[patchconstantfunc("PatchConstantFunction")]
+void main() {
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
new file mode 100644
index 0000000000..b6b28700a9
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
@@ -0,0 +1,344 @@
+// RUN: %dxc -T lib_6_9 -DTYPE=HitStruct -verify %s
+// RUN: %dxc -T lib_6_9 -DTYPE=HitStructSub -verify %s
+
+
+#define PASTE_(x,y) x##y
+#define PASTE(x,y) PASTE_(x,y)
+
+#ifndef TYPE
+#define TYPE HitTpl<dx::HitObject>
+#endif
+
+// Add tests for base types and instantiated template classes with HitObjects
+
+struct HitStruct {
+  float4 f;
+  dx::HitObject hit;
+};
+
+struct HitStructSub : HitStruct {
+  int3 is;
+};
+
+template <typename T>
+struct HitTpl {
+  float4 f;
+  T val;
+};
+
+TYPE global_type;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+dx::HitObject global_hit;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+dx::HitObject global_hit_arr[10];
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+
+static TYPE static_gv;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in global variables}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+cbuffer BadBuffy {
+  dx::HitObject cb_hit;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject cb_hit_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+};
+
+tbuffer BadTuffy {
+  dx::HitObject tb_vec; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject tb_vec_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  TYPE tb_vec_rec; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  TYPE tb_vec_rec_arr[10]; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+};
+
+StructuredBuffer<TYPE> struct_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+RWStructuredBuffer<TYPE> rw_struct_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+ConstantBuffer<TYPE> const_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+TextureBuffer<TYPE> tex_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+ByteAddressBuffer bab;
+RWByteAddressBuffer rw_bab;
+
+[Shader("raygeneration")]
+void main()
+{
+  bab.Load<TYPE>(0);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  rw_bab.Load<TYPE>(0);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  TYPE val;
+  rw_bab.Store<TYPE>(0, val);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Store must be a single numeric type}}
+}
+
+[shader("pixel")]
+TYPE ps_main( 
+// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+    TYPE vec : V) : SV_Target {
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+  return vec;
+}
+
+[shader("vertex")]
+TYPE vs_main(
+// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+    TYPE parm : P) : SV_Target {
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+  parm.f = 0;
+  return parm;
+}
+
+
+[shader("geometry")]
+[maxvertexcount(3)]
+void gs_point(
+    line TYPE e,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    inout PointStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{}
+
+[shader("geometry")]
+[maxvertexcount(12)]
+void gs_line( 
+    line TYPE a,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    inout LineStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{}
+
+
+[shader("geometry")]
+[maxvertexcount(12)]
+void gs_tri(
+    triangle TYPE a,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    inout TriangleStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{}
+
+[shader("domain")]
+[domain("tri")]
+void ds_main(
+    OutputPatch<TYPE, 3> TrianglePatch)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{}
+
+void patch_const(
+    InputPatch<TYPE, 3> inpatch,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    OutputPatch<TYPE, 3> outpatch)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{}
+
+[shader("hull")]
+[domain("tri")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(32)]
+[patchconstantfunc("patch_const")]
+void hs_main(InputPatch<TYPE, 3> TrianglePatch) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+RaytracingAccelerationStructure RTAS;
+
+struct [raypayload] DXRHitStruct {
+  float4 f : write(closesthit) : read(caller);
+  TYPE hit : write(closesthit) : read(caller);
+};
+
+struct [raypayload] DXRHitStructSub : DXRHitStruct {
+  int3 is : write(closesthit) : read(caller);
+};
+
+template<typename T>
+struct [raypayload] DXRHitTpl {
+  float4 f : write(closesthit) : read(caller);
+  T hit : write(closesthit) : read(caller);
+};
+
+#define RTTYPE PASTE(DXR,TYPE)
+
+
+TYPE userFunc(TYPE arg) {
+  return arg;
+}
+
+[shader("raygeneration")]
+void raygen() {
+  RTTYPE p = (RTTYPE)0;
+  RayDesc ray = (RayDesc)0;
+  TraceRay(RTAS, RAY_FLAG_NONE, 0, 0, 1, 0, ray, p); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  CallShader(0, p);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  TYPE val;
+  TYPE res = userFunc(val);
+}
+
+[shader("closesthit")]
+void closesthit(
+    inout RTTYPE payload,
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    in RTTYPE attribs) {
+    // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+  RayDesc ray;
+  TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload );
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  CallShader(0, payload); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+[shader("anyhit")]
+void AnyHit(
+    inout RTTYPE payload, 
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    in RTTYPE attribs)
+    // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{
+}
+
+[shader("miss")]
+void Miss(
+    inout RTTYPE payload){
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+  RayDesc ray;
+  TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  CallShader(0, payload);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+[shader("intersection")]
+void Intersection() {
+  float hitT = RayTCurrent();
+  RTTYPE attr = (RTTYPE)0;
+  bool bReported = ReportHit(hitT, 0, attr);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+[shader("callable")]
+void callable1(
+    inout RTTYPE p) { 
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    // expected-error@-3{{callable parameter 'p' must be a user-defined type composed of only numeric types}}
+  CallShader(0, p); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+static groupshared TYPE gs_var;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in groupshared variables}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+[shader("amplification")]
+[numthreads(1,1,1)]
+void Amp() {
+  TYPE as_pld;
+  DispatchMesh(1,1,1,as_pld); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+struct NodeHitStruct {
+  uint3 grid : SV_DispatchGrid;
+  TYPE hit;
+};
+
+struct NodeHitStructSub : NodeHitStruct {
+  int3 is;
+};
+
+template<typename T>
+struct NodeHitTpl {
+  uint3 grid : SV_DispatchGrid;
+  T hit;
+};
+
+#define NTYPE PASTE(Node,TYPE)
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(8,1,1)]
+[NodeMaxDispatchGrid(8, 1, 1)]
+void broadcast(
+// expected-error@-1{{Broadcasting node shader 'broadcast' with NodeMaxDispatchGrid attribute must declare an input record containing a field with SV_DispatchGrid semantic}}
+    DispatchNodeInputRecord<NTYPE> input,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+    NodeOutput<TYPE> output)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+    // expected-note@16{{'dx::HitObject' field declared here}}
+{
+  ThreadNodeOutputRecords<TYPE> touts; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+  GroupNodeOutputRecords<TYPE> gouts;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+  // expected-note@16{{'dx::HitObject' field declared here}}
+}
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(8,1,1)]
+void coalesce(GroupNodeInputRecords<TYPE> input) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void threader(ThreadNodeInputRecord<TYPE> input) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+// expected-note@16{{'dx::HitObject' field declared here}}
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl
new file mode 100644
index 0000000000..4ffd53878d
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl
@@ -0,0 +1,340 @@
+// RUN: %dxc -T lib_6_9 -verify %s
+
+
+#define PASTE_(x,y) x##y
+#define PASTE(x,y) PASTE_(x,y)
+
+#define TYPE HitTpl<dx::HitObject>
+
+// Add tests for base types and instantiated template classes with HitObjects
+
+struct HitStruct {
+  float4 f;
+  dx::HitObject hit;
+};
+
+struct HitStructSub : HitStruct {
+  int3 is;
+};
+
+template <typename T>
+struct HitTpl {
+  float4 f;
+  T val; // file line 23: anchor of the '@23' field-declared notes below
+};
+
+RaytracingAccelerationStructure RTAS;
+
+struct [raypayload] DXRHitStruct {
+  float4 f : write(closesthit) : read(caller);
+  TYPE hit : write(closesthit) : read(caller);
+};
+
+struct [raypayload] DXRHitStructSub : DXRHitStruct {
+  int3 is : write(closesthit) : read(caller);
+};
+
+template<typename T>
+struct [raypayload] DXRHitTpl {
+  float4 f : write(closesthit) : read(caller);
+  T hit : write(closesthit) : read(caller); // file line 40: '@40' note anchor
+};
+
+struct NodeHitStruct {
+  uint3 grid : SV_DispatchGrid;
+  TYPE hit;
+};
+
+struct NodeHitStructSub : NodeHitStruct {
+  int3 is;
+};
+
+template<typename T>
+struct NodeHitTpl {
+  uint3 grid : SV_DispatchGrid;
+  T hit; // file line 55: anchor of the '@55' field-declared note below
+};
+
+TYPE global_type;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+dx::HitObject global_hit;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+dx::HitObject global_hit_arr[10];
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+
+static TYPE static_gv;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in global variables}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+
+cbuffer BadBuffy {
+  dx::HitObject cb_hit;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject cb_hit_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+};
+
+tbuffer BadTuffy {
+  dx::HitObject tb_vec; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject tb_vec_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  TYPE tb_vec_rec; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+  TYPE tb_vec_rec_arr[10]; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+};
+
+StructuredBuffer<TYPE> struct_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+RWStructuredBuffer<TYPE> rw_struct_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+ConstantBuffer<TYPE> const_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+TextureBuffer<TYPE> tex_buf;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+
+ByteAddressBuffer bab;
+RWByteAddressBuffer rw_bab;
+
+[Shader("raygeneration")]
+void main()
+{
+  bab.Load<TYPE>(0);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  rw_bab.Load<TYPE>(0);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  TYPE val;
+  rw_bab.Store<TYPE>(0, val);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+  // expected-error@-3{{Explicit template arguments on intrinsic Store must be a single numeric type}}
+}
+
+[shader("pixel")]
+TYPE ps_main( 
+// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+    TYPE vec : V) : SV_Target {
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+  return vec;
+}
+
+[shader("vertex")]
+TYPE vs_main(
+// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+    TYPE parm : P) : SV_Target {
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+  parm.f = 0;
+  return parm;
+}
+
+
+[shader("geometry")]
+[maxvertexcount(3)]
+void gs_point(
+    line TYPE e,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+    inout PointStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{}
+
+[shader("geometry")]
+[maxvertexcount(12)]
+void gs_line( 
+    line TYPE a,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+    inout LineStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{}
+
+
+[shader("geometry")]
+[maxvertexcount(12)]
+void gs_line(
+    line TYPE a,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+    inout TriangleStream<TYPE> OutputStream0)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{}
+
+[shader("domain")]
+[domain("tri")]
+void ds_main(
+    OutputPatch<TYPE, 3> TrianglePatch)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{}
+
+void patch_const(
+    InputPatch<TYPE, 3> inpatch,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+    OutputPatch<TYPE, 3> outpatch)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{}
+
+[shader("hull")]
+[domain("tri")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(32)]
+[patchconstantfunc("patch_const")]
+void hs_main(InputPatch<TYPE, 3> TrianglePatch) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+
+#define RTTYPE PASTE(DXR,TYPE)
+
+TYPE userFunc(TYPE arg) {
+  return arg;
+}
+
+[shader("raygeneration")]
+void raygen() {
+  RTTYPE p = (RTTYPE)0;
+  RayDesc ray = (RayDesc)0;
+  TraceRay(RTAS, RAY_FLAG_NONE, 0, 0, 1, 0, ray, p); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+  CallShader(0, p);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+  TYPE val;
+  TYPE res = userFunc(val);
+}
+
+[shader("closesthit")]
+void closesthit(
+    inout RTTYPE payload,
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+    in RTTYPE attribs) {
+    // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+  RayDesc ray;
+  TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload );
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+  CallShader(0, payload); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+}
+
+[shader("anyhit")]
+void AnyHit(
+    inout RTTYPE payload, 
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+    in RTTYPE attribs)
+    // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+{
+}
+
+[shader("miss")]
+void Miss(
+    inout RTTYPE payload){
+    // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}}
+    // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+  RayDesc ray;
+  TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+  CallShader(0, payload);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+}
+
+[shader("intersection")]
+void Intersection() {
+  float hitT = RayTCurrent();
+  RTTYPE attr = (RTTYPE)0;
+  bool bReported = ReportHit(hitT, 0, attr);
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+}
+
+[shader("callable")]
+void callable1(
+    inout RTTYPE p) { 
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}}
+    // expected-note@40{{'dx::HitObject' field declared here}}
+    // expected-error@-3{{callable parameter 'p' must be a user-defined type composed of only numeric types}}
+  CallShader(0, p); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@40{{'dx::HitObject' field declared here}}
+}
+
+static groupshared TYPE gs_var;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in groupshared variables}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+
+[shader("amplification")]
+[numthreads(1,1,1)]
+void Amp() {
+  TYPE as_pld;
+  DispatchMesh(1,1,1,as_pld); 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+}
+
+#define NTYPE PASTE(Node,TYPE)
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(8,1,1)]
+[NodeMaxDispatchGrid(8, 1, 1)]
+void broadcast(
+// expected-error@-1{{Broadcasting node shader 'broadcast' with NodeMaxDispatchGrid attribute must declare an input record containing a field with SV_DispatchGrid semantic}}
+    DispatchNodeInputRecord<NTYPE> input,
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+    // expected-note@55{{'dx::HitObject' field declared here}}
+    NodeOutput<TYPE> output)
+    // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+    // expected-note@23{{'dx::HitObject' field declared here}}
+{
+  ThreadNodeOutputRecords<TYPE> touts; 
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+  GroupNodeOutputRecords<TYPE> gouts;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+  // expected-note@23{{'dx::HitObject' field declared here}}
+}
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(8,1,1)]
+void coalesce(GroupNodeInputRecords<TYPE> input) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+// expected-note@23{{'dx::HitObject' field declared here}}
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void threader(ThreadNodeInputRecord<TYPE> input) {}
+// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}}
+// expected-note@23{{'dx::HitObject' field declared here}}
diff --git a/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl b/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl
index 40b820a1b4..de523d51d1 100644
--- a/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl
@@ -76,7 +76,7 @@ void node07(RWThreadNodeInputRecord<f2x2> input) // expected-error {{'f2x2' (aka
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node08(ThreadNodeInputRecord<BAD_RECORD> input) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node08(ThreadNodeInputRecord<BAD_RECORD> input) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
 
 [Shader("node")]
@@ -86,17 +86,17 @@ void node09(ThreadNodeInputRecord<BAD_RECORD[4]> input) // expected-error {{'BAD
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node10(RWThreadNodeInputRecord<BAD_RECORD2> input) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node10(RWThreadNodeInputRecord<BAD_RECORD2> input) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node11(NodeOutput<BAD_RECORD> input) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node11(NodeOutput<BAD_RECORD> input) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node12(NodeOutputArray<MyBadRecord> output) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node12(NodeOutputArray<MyBadRecord> output) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
 
 [Shader("node")]
@@ -129,7 +129,7 @@ void node16()
 
   ThreadNodeOutputRecords<f2x2> outrec2; // expected-error {{'f2x2' (aka 'matrix<float, 2, 2>') is not valid as a node record type - struct/class required}}
 
-  GroupNodeOutputRecords<MyBadRecord> outrec3; // expected-error {{object 'SamplerState' may not appear in a node record}}
+  GroupNodeOutputRecords<MyBadRecord> outrec3; // expected-error {{object 'SamplerState' is not allowed in node records}}
 
   ThreadNodeOutputRecords<SamplerState> outrec4; // expected-error {{'SamplerState' is not valid as a node record type - struct/class required}}
 }
@@ -151,10 +151,10 @@ void node17(ThreadNodeInputRecord<MyTemplateStruct<int> > input)
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node18(ThreadNodeInputRecord<MyTemplateStruct<SamplerState> > input) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node18(ThreadNodeInputRecord<MyTemplateStruct<SamplerState> > input) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
 
 [Shader("node")]
 [NodeLaunch("thread")]
-void node19(RWThreadNodeInputRecord<MyNestedTemplateStruct> input) // expected-error {{object 'SamplerState' may not appear in a node record}}
+void node19(RWThreadNodeInputRecord<MyNestedTemplateStruct> input) // expected-error {{object 'SamplerState' is not allowed in node records}}
 { }
diff --git a/tools/clang/test/SemaHLSL/template-checks.hlsl b/tools/clang/test/SemaHLSL/template-checks.hlsl
index d0d736fc1f..751e89b652 100644
--- a/tools/clang/test/SemaHLSL/template-checks.hlsl
+++ b/tools/clang/test/SemaHLSL/template-checks.hlsl
@@ -1,8 +1,8 @@
 // RUN: %dxc -Tlib_6_3 -verify %s
 
 Texture2D<float4> t_float4;
-Texture2D<SamplerState> t_obj_sampler;          /* expected-error {{'SamplerState' is an object and cannot be used as a type parameter}} fxc-error {{X3124: object element type cannot be an object type}} */
-Texture2D<Texture2D<float4> > t_obj_tex;        /* expected-error {{'Texture2D<float4>' is an object and cannot be used as a type parameter}} fxc-error {{X3124: object element type cannot be an object type}} */
+Texture2D<SamplerState> t_obj_sampler;          /* expected-error {{object 'SamplerState' is not allowed in builtin template parameters}} fxc-error {{X3124: object element type cannot be an object type}} */
+Texture2D<Texture2D<float4> > t_obj_tex;        /* expected-error {{object 'Texture2D<float4>' is not allowed in builtin template parameters}} fxc-error {{X3124: object element type cannot be an object type}} */
 
 matrix<SamplerState, 1, 2> m_obj_sampler;       /* expected-error {{'SamplerState' cannot be used as a type parameter where a scalar is required}} fxc-error {{X3123: matrix element type must be a scalar type}} */
 matrix<bool, 1, 2> m_bool;
@@ -15,7 +15,7 @@ matrix<bool, 1, 2> m_bool;
 
 RWBuffer<double3> rwb_struct;    /* expected-error {{elements of typed buffers and textures must fit in four 32-bit quantities}} fxc-error {{X3037: elements of typed buffers and textures must fit in four 32-bit quantities}} */
 
-RWBuffer<SamplerState> rwb_struct_objs; /* expected-error {{'SamplerState' is an object and cannot be used as a type parameter}} */
+RWBuffer<SamplerState> rwb_struct_objs; /* expected-error {{object 'SamplerState' is not allowed in builtin template parameters}} */
 
 void vain() {
   // Nothing to do here.
diff --git a/tools/clang/test/SemaHLSL/template-udt-load.hlsl b/tools/clang/test/SemaHLSL/template-udt-load.hlsl
index 591f27b384..f666297bb9 100644
--- a/tools/clang/test/SemaHLSL/template-udt-load.hlsl
+++ b/tools/clang/test/SemaHLSL/template-udt-load.hlsl
@@ -8,6 +8,8 @@ RWBuffer<float> Out;
 [numthreads(1,1,1)]
 void main()
 { 
-  RWBuffer<float> FB = In.Load<RWBuffer<float> >(0); // expected-error {{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  RWBuffer<float> FB = In.Load<RWBuffer<float> >(0);
+  // expected-error@-1{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  // expected-error@-2{{object 'RWBuffer<float>' is not allowed in builtin template parameters}}
   Out[0] = FB[0];
 }

From adffd31eb02f690d9d1afe86c7fa6f12b4e70aa1 Mon Sep 17 00:00:00 2001
From: Anupama Chandrasekhar <anupamac@nvidia.com>
Date: Fri, 16 May 2025 15:24:57 -0700
Subject: [PATCH 44/93] Implement HLSL Diagnostics for LinAlg operations
 (#7430)

This PR implements checks to validate the linalg builtin functions:
__builtin_MatVecMul, __builtin_MatVecMulAdd,
__builtin_OuterProductAccumulate and __builtin_VectorAccumulate. This
includes:
- verify valid types for input and output vectors
- const checks for compile-time const parameters
- value checks for interpretation and layout (enum) parameters
- min/max checks for matrix dimensions
- verify input and output vectors have the right dimensions for the given
matrix (dimM and dimK): packed and unpacked cases
- verify matrix layout, transpose and stride rules
- incorrect shader model warning

Adds tests for all the above error checks.

Implements
https://github.com/microsoft/DirectXShaderCompiler/issues/7336
---
 include/dxc/dxcapi.internal.h                 |   10 +-
 .../clang/Basic/DiagnosticSemaKinds.td        |   37 +
 tools/clang/lib/Headers/hlsl/dx/linalg.h      |   44 +-
 tools/clang/lib/Sema/SemaHLSL.cpp             |  562 ++++++-
 .../mat-vec-mul-add_multioverload.hlsl        |   86 +-
 .../mat-vec-mul_multioverload.hlsl            |   86 +-
 ...uter-product-accumulate-multioverload.hlsl |    7 +-
 .../CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl  |   60 +-
 .../hlsl/linalg/builtins/mul_add_invalid.hlsl | 1398 +++++++++++++++++
 .../hlsl/linalg/builtins/mul_add_valid.hlsl   |  244 +++
 .../hlsl/linalg/builtins/mul_invalid.hlsl     | 1156 ++++++++++++++
 .../hlsl/linalg/builtins/mul_valid.hlsl       |  344 ++++
 .../outer_product_accumulate_invalid.hlsl     |  256 +++
 .../outer_product_accumulate_valid.hlsl       |   66 +
 .../hlsl/linalg/make-interp-vec-errors.hlsl   |    4 +-
 .../hlsl/linalg/mat-vec-mul-errors.hlsl       |    2 +-
 .../hlsl/linalg/mat-vec-muladd-errors.hlsl    |    2 +-
 .../linalg/outerproductaccumulate-errors.hlsl |    6 +-
 .../hlsl/linalg/unavailable-pre-sm69.hlsl     |   14 +-
 utils/hct/gen_intrin_main.txt                 |    8 +-
 utils/hct/hctdb.py                            |    5 +-
 21 files changed, 4271 insertions(+), 126 deletions(-)
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl

diff --git a/include/dxc/dxcapi.internal.h b/include/dxc/dxcapi.internal.h
index 28bd3e7066..41891338e6 100644
--- a/include/dxc/dxcapi.internal.h
+++ b/include/dxc/dxcapi.internal.h
@@ -133,11 +133,15 @@ enum LEGAL_INTRINSIC_COMPTYPES {
   LICOMPTYPE_HIT_OBJECT = 51,
   LICOMPTYPE_RAY_QUERY = 52,
 
+  LICOMPTYPE_LINALG = 53, // f32, partial-precision-f32, f16,
+                          // i32, i16, u32, u16,
+                          // int8_4packed, uint8_4packed
+
 #ifdef ENABLE_SPIRV_CODEGEN
-  LICOMPTYPE_VK_BUFFER_POINTER = 53,
-  LICOMPTYPE_COUNT = 54
+  LICOMPTYPE_VK_BUFFER_POINTER = 54,
+  LICOMPTYPE_COUNT = 55
 #else
-  LICOMPTYPE_COUNT = 53
+  LICOMPTYPE_COUNT = 54
 #endif
 };
 
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 003aa50795..48412facad 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8018,6 +8018,43 @@ def err_hlsl_reorder_unsupported_stage : Error<
    "dx::MaybeReorderThread is unavailable in shader stage '%0' (requires 'raygeneration')">;
 def err_hlsl_hitobject_unsupported_stage : Error<
    "dx::HitObject is unavailable in shader stage '%0' (requires 'raygeneration', 'closesthit' or 'miss')">;
+
+// Linear Algebra Operations
+def err_hlsl_linalg_isunsigned_incorrect_for_given_type : Error<
+  "%0 must be %select{false|true}1 for vector of "
+  "%select{floating point|signed integer|unsigned integer}2 type">;
+def err_hlsl_linalg_interpretation_value_incorrect : Error<
+  "%0 is an invalid %select{memory|register}1 interpretation value">;
+def err_hlsl_linalg_matrix_layout_is_not_transposable : Error<
+  "RowMajor and ColumnMajor matrices are not transposable">;
+def err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero : Error<
+  "for optimal matrix layout, matrix stride must be 0">;
+def err_hlsl_linalg_matrix_dim_must_be_greater_than_zero: Error<
+  "matrix dimension must be greater than 0">;
+def err_hlsl_linalg_matrix_layout_invalid : Error<
+  "matrix layout %0 is not valid, must be in the range [%1, %2]">;
+
+def err_hlsl_linalg_mul_muladd_output_vector_size_not_equal_to_matrix_M : Error<
+  "output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation">;
+def err_hlsl_linalg_mul_muladd_unpacked_input_vector_size_not_equal_to_matrix_K : Error<
+  "unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation">;
+def err_hlsl_linalg_mul_muladd_packed_input_vector_size_incorrect : Error<
+  "packed input vector length must be the smallest number that can hold matrix dim K values of the "
+  "packed(smaller) type in linalg mul/muladd operations">;
+def err_hlsl_linalg_mul_muladd_isUnsigned_for_packed_input_must_be_true : Error<
+  "IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations">;
+def err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint : Error<
+  "packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations">;
+def err_hlsl_linalg_mul_muladd_invalid_dim: Error<
+  "matrix dimension %select{M|K when using unpacked input vectors|K "
+  "when using packed input vectors}0 must be less than %1, in a linalg "
+  "Mul/MulAdd operation">;
+
+def err_hlsl_linalg_outer_prod_acc_vector_type_mismatch : Error<
+  "input vectors of outerproductaccumulate must have the same element type">;
+def err_hlsl_linalg_outer_prod_acc_matrix_layout_must_be_outer_prod_acc_optimal : Error<
+  "matrix layout for outerproductaccumulate must be %0">;
+
 // HLSL Change Ends
 
 // SPIRV Change Starts
diff --git a/tools/clang/lib/Headers/hlsl/dx/linalg.h b/tools/clang/lib/Headers/hlsl/dx/linalg.h
index 51e662bbc9..4f5e62070d 100644
--- a/tools/clang/lib/Headers/hlsl/dx/linalg.h
+++ b/tools/clang/lib/Headers/hlsl/dx/linalg.h
@@ -43,14 +43,30 @@ enum MatrixLayout {
 // Helper for signedness
 //
 namespace details {
-template <typename T> bool IsUnsigned() { return false; }
+
+template <typename T> struct IsUnsigned {}; // primary: no Value member
+
+#define _SPECIALIZE_ISUNSIGNED(type, value)                                    \
+  template <> struct IsUnsigned<type> {                                        \
+    static const bool Value = value;                                           \
+  }
+
+_SPECIALIZE_ISUNSIGNED(uint8_t4_packed, true);
+_SPECIALIZE_ISUNSIGNED(int8_t4_packed, true);
+_SPECIALIZE_ISUNSIGNED(uint32_t, true);
+_SPECIALIZE_ISUNSIGNED(int32_t, false);
+_SPECIALIZE_ISUNSIGNED(float32_t, false);
 
 #ifdef __HLSL_ENABLE_16_BIT
-template <> bool IsUnsigned<uint16_t>() { return true; }
-#endif
+_SPECIALIZE_ISUNSIGNED(uint16_t, true);
+_SPECIALIZE_ISUNSIGNED(int16_t, false);
+_SPECIALIZE_ISUNSIGNED(float16_t, false);
+#else  // __HLSL_ENABLE_16_BIT
+_SPECIALIZE_ISUNSIGNED(half, false);
+#endif // __HLSL_ENABLE_16_BIT
+
+#undef _SPECIALIZE_ISUNSIGNED
 
-template <> bool IsUnsigned<uint32_t>() { return true; }
-template <> bool IsUnsigned<uint64_t>() { return true; }
 } // namespace details
 
 //
@@ -116,10 +132,10 @@ Mul(MatrixRefImpl<MatrixBufferTy, MatrixDT, MatrixM, MatrixK, MatrixLayout,
   vector<OutputElTy, MatrixM> OutputVector;
 
   __builtin_MatVecMul(
-      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>(), InputVector.Data,
-      details::IsUnsigned<InputElTy>(), InputDT, Matrix.Buffer,
-      Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, MatrixLayout,
-      MatrixTranspose, Matrix.Stride);
+      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>::Value,
+      InputVector.Data, details::IsUnsigned<InputElTy>::Value, InputDT,
+      Matrix.Buffer, Matrix.StartOffset, MatrixDT, MatrixM, MatrixK,
+      MatrixLayout, MatrixTranspose, Matrix.Stride);
 
   return OutputVector;
 }
@@ -143,11 +159,11 @@ MulAdd(MatrixRefImpl<MatrixBufferTy, MatrixDT, MatrixM, MatrixK, MatrixLayout,
   vector<OutputElTy, MatrixM> OutputVector;
 
   __builtin_MatVecMulAdd(
-      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>(), InputVector.Data,
-      details::IsUnsigned<InputElTy>(), InputDT, Matrix.Buffer,
-      Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, MatrixLayout,
-      MatrixTranspose, Matrix.Stride, BiasVector.Buffer, BiasVector.StartOffset,
-      BiasVectorDT);
+      /*out*/ OutputVector, details::IsUnsigned<OutputElTy>::Value,
+      InputVector.Data, details::IsUnsigned<InputElTy>::Value, InputDT,
+      Matrix.Buffer, Matrix.StartOffset, MatrixDT, MatrixM, MatrixK,
+      MatrixLayout, MatrixTranspose, Matrix.Stride, BiasVector.Buffer,
+      BiasVector.StartOffset, BiasVectorDT);
 
   return OutputVector;
 }
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index e5424ecdde..fa59aa6ef7 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -15,6 +15,7 @@
 
 #include "clang/Sema/SemaHLSL.h"
 #include "VkConstantsTables.h"
+#include "dxc/DXIL/DxilConstants.h"
 #include "dxc/DXIL/DxilFunctionProps.h"
 #include "dxc/DXIL/DxilShaderModel.h"
 #include "dxc/DXIL/DxilUtil.h"
@@ -1139,6 +1140,14 @@ static const ArBasicKind g_RayDescCT[] = {AR_OBJECT_RAY_DESC, AR_BASIC_UNKNOWN};
 static const ArBasicKind g_RayQueryCT[] = {AR_OBJECT_RAY_QUERY,
                                            AR_BASIC_UNKNOWN};
 
+static const ArBasicKind g_LinAlgCT[] = {
+    AR_BASIC_FLOAT32,       AR_BASIC_FLOAT32_PARTIAL_PRECISION,
+    AR_BASIC_FLOAT16,       AR_BASIC_INT32,
+    AR_BASIC_INT16,         AR_BASIC_UINT32,
+    AR_BASIC_UINT16,        AR_BASIC_INT8_4PACKED,
+    AR_BASIC_UINT8_4PACKED, AR_BASIC_NOCAST,
+    AR_BASIC_UNKNOWN};
+
 static const ArBasicKind g_AccelerationStructCT[] = {
     AR_OBJECT_ACCELERATION_STRUCT, AR_BASIC_UNKNOWN};
 
@@ -1302,6 +1311,7 @@ const ArBasicKind *g_LegalIntrinsicCompTypes[] = {
     g_ThreadNodeOutputRecordsCT, // LICOMPTYPE_THREAD_NODE_OUTPUT_RECORDS
     g_DxHitObjectCT,             // LICOMPTYPE_HIT_OBJECT
     g_RayQueryCT,                // LICOMPTYPE_RAY_QUERY
+    g_LinAlgCT,                  // LICOMPTYPE_LINALG
 #ifdef ENABLE_SPIRV_CODEGEN
     g_VKBufferPointerCT, // LICOMPTYPE_VK_BUFFER_POINTER
 #endif
@@ -11674,6 +11684,537 @@ static bool CheckBarrierCall(Sema &S, FunctionDecl *FD, CallExpr *CE,
   return false;
 }
 
+// Argument indices of the MatVecMul / MatVecMulAdd builtin calls.
+static const unsigned kMatVecMulOutputVectorIdx = 0;
+static const unsigned kMatVecMulOutputIsUnsignedIdx = 1;
+static const unsigned kMatVecMulInputVectorIdx = 2;
+static const unsigned kMatVecMulIsInputUnsignedIdx = 3;
+static const unsigned kMatVecMulInputInterpretationIdx = 4;
+// static const unsigned kMatVecMulMatrixBufferIdx = 5;
+// static const unsigned kMatVecMulMatrixOffsetIdx = 6;
+static const unsigned kMatVecMulMatrixInterpretationIdx = 7;
+static const unsigned kMatVecMulMatrixMIdx = 8;
+static const unsigned kMatVecMulMatrixKIdx = 9;
+static const unsigned kMatVecMulMatrixLayoutIdx = 10;
+static const unsigned kMatVecMulMatrixTransposeIdx = 11;
+static const unsigned kMatVecMulMatrixStrideIdx = 12;
+
+// MatVecMulAdd only: argument index of BiasInterpretation.
+static const unsigned kMatVecMulAddBiasInterpretation = 15;
+
+static bool IsValidMatrixLayoutForMulAndMulAddOps(unsigned Layout) {
+  const auto MaxLayout = DXIL::LinalgMatrixLayout::OuterProductOptimal;
+  return Layout <= static_cast<unsigned>(MaxLayout); // Inclusive upper bound.
+}
+
+static bool IsOptimalTypeMatrixLayout(unsigned Layout) {
+  // True only for the MulOptimal and OuterProductOptimal layouts.
+  using ML = DXIL::LinalgMatrixLayout;
+  return Layout == static_cast<unsigned>(ML::MulOptimal) ||
+         Layout == static_cast<unsigned>(ML::OuterProductOptimal);
+}
+
+static bool IsValidTransposeForMatrixLayout(unsigned Layout, bool Transposed) {
+  // RowMajor and ColumnMajor matrices must not be flagged as transposed;
+  // every other layout value accepts either setting.
+  using ML = DXIL::LinalgMatrixLayout;
+  const ML Kind = static_cast<ML>(Layout);
+  if (Kind == ML::RowMajor || Kind == ML::ColumnMajor)
+    return !Transposed;
+
+  return true;
+}
+
+static bool IsPackedType(unsigned type) {
+  using CT = DXIL::ComponentType;
+  return type == static_cast<unsigned>(CT::PackedS8x32) ||
+         type == static_cast<unsigned>(CT::PackedU8x32);
+}
+
+static bool IsValidLinalgTypeInterpretation(uint32_t Input, bool InRegister) {
+  // Packed component types are accepted only when InRegister is set.
+  if (IsPackedType(Input))
+    return InRegister;
+  switch (static_cast<DXIL::ComponentType>(Input)) {
+  case DXIL::ComponentType::I8:
+  case DXIL::ComponentType::U8:
+  case DXIL::ComponentType::I16:
+  case DXIL::ComponentType::U16:
+  case DXIL::ComponentType::I32:
+  case DXIL::ComponentType::U32:
+  case DXIL::ComponentType::F16:
+  case DXIL::ComponentType::F32:
+  case DXIL::ComponentType::F8_E4M3:
+  case DXIL::ComponentType::F8_E5M2:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool IsValidVectorAndMatrixDimensions(Sema &S, CallExpr *CE,
+                                             unsigned InputVectorSize,
+                                             unsigned OutputVectorSize,
+                                             unsigned MatrixK, unsigned MatrixM,
+                                             bool isInputPacked) {
+  // The output vector length must match matrix dimension M exactly.
+  if (OutputVectorSize != MatrixM) {
+    const Expr *OutputArg = CE->getArg(kMatVecMulOutputVectorIdx);
+    S.Diags.Report(
+        OutputArg->getExprLoc(),
+        diag::
+            err_hlsl_linalg_mul_muladd_output_vector_size_not_equal_to_matrix_M);
+    return false;
+  }
+
+  // The input vector must be exactly as long as K values require. A packed
+  // element carries four values, so the required length is K / 4 rounded
+  // up; an unpacked input needs one element per value.
+  const unsigned ValuesPerElement = isInputPacked ? 4 : 1;
+  const unsigned RequiredInputSize =
+      (MatrixK + ValuesPerElement - 1) / ValuesPerElement;
+  if (InputVectorSize == RequiredInputSize)
+    return true;
+
+  // Length mismatch: the diagnostic depends on the packing mode.
+  const Expr *InputArg = CE->getArg(kMatVecMulInputVectorIdx);
+  if (isInputPacked) {
+    S.Diags.Report(
+        InputArg->getExprLoc(),
+        diag::err_hlsl_linalg_mul_muladd_packed_input_vector_size_incorrect);
+  } else {
+    S.Diags.Report(
+        InputArg->getExprLoc(),
+        diag::
+            err_hlsl_linalg_mul_muladd_unpacked_input_vector_size_not_equal_to_matrix_K);
+  }
+
+  return false;
+}
+
+// Checks the arguments MatVecMul and MatVecMulAdd share; stops at first error.
+static void CheckCommonMulAndMulAddParameters(Sema &S, CallExpr *CE,
+                                              const hlsl::ShaderModel *SM) {
+  // IsOutputUnsigned must be an integer constant expression
+  bool IsOutputUnsignedFlagValue = false;
+  Expr *IsOutputUnsignedExpr = CE->getArg(kMatVecMulOutputIsUnsignedIdx);
+  llvm::APSInt IsOutputUnsignedExprVal;
+  if (IsOutputUnsignedExpr->isIntegerConstantExpr(IsOutputUnsignedExprVal,
+                                                  S.Context)) {
+    IsOutputUnsignedFlagValue = IsOutputUnsignedExprVal.getBoolValue();
+  } else {
+    S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(), diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  Expr *OutputVectorExpr = CE->getArg(kMatVecMulOutputVectorIdx);
+  unsigned OutputVectorSizeValue = 0;
+  if (IsHLSLVecType(OutputVectorExpr->getType())) {
+    OutputVectorSizeValue = GetHLSLVecSize(OutputVectorExpr->getType());
+    QualType OutputVectorType =
+        GetHLSLVecElementType(OutputVectorExpr->getType());
+    const Type *OutputVectorTypePtr = OutputVectorType.getTypePtr();
+
+    // IsOutputUnsigned must match the output vector element type: true for
+    // unsigned int outputs, false for signed int/float outputs.
+    if (IsOutputUnsignedFlagValue &&
+        !OutputVectorTypePtr->isUnsignedIntegerType()) {
+      DXASSERT_NOMSG(OutputVectorTypePtr->isSignedIntegerType() ||
+                     OutputVectorTypePtr->isFloatingType());
+      S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type)
+          << "IsOuputUnsigned" << false
+          << (OutputVectorTypePtr->isSignedIntegerType() ? 1 : 0);
+      return;
+    } else if (!IsOutputUnsignedFlagValue &&
+               OutputVectorTypePtr->isUnsignedIntegerType()) {
+      S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type)
+          << "IsOuputUnsigned" << true << 2;
+      return;
+    }
+  }
+
+  // IsInputUnsigned must be an integer constant expression
+  bool IsInputUnsignedFlagValue = false;
+  Expr *IsInputUnsignedExpr = CE->getArg(kMatVecMulIsInputUnsignedIdx);
+  llvm::APSInt IsInputUnsignedExprVal;
+  if (IsInputUnsignedExpr->isIntegerConstantExpr(IsInputUnsignedExprVal,
+                                                 S.Context)) {
+    IsInputUnsignedFlagValue = IsInputUnsignedExprVal.getBoolValue();
+  } else {
+    S.Diags.Report(IsInputUnsignedExpr->getExprLoc(), diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  // Get InputInterpretation, check if it is constant
+  Expr *InputInterpretationExpr = CE->getArg(kMatVecMulInputInterpretationIdx);
+  llvm::APSInt InputInterpretationExprVal;
+  unsigned InputInterpretationValue = 0;
+  if (InputInterpretationExpr->isIntegerConstantExpr(InputInterpretationExprVal,
+                                                     S.Context)) {
+    InputInterpretationValue = InputInterpretationExprVal.getLimitedValue();
+    const bool InRegisterInterpretation = true;
+    if (!IsValidLinalgTypeInterpretation(InputInterpretationValue,
+                                         InRegisterInterpretation)) {
+      S.Diags.Report(InputInterpretationExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_interpretation_value_incorrect)
+          << std::to_string(InputInterpretationValue)
+          << InRegisterInterpretation;
+      return;
+    }
+  } else {
+    S.Diags.Report(InputInterpretationExpr->getExprLoc(),
+                   diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  bool IsInputVectorPacked = IsPackedType(InputInterpretationValue);
+
+  // For packed types input vector type must be uint and isUnsigned must be
+  // true. The signedness is determined from the InputInterpretation
+  Expr *InputVectorExpr = CE->getArg(kMatVecMulInputVectorIdx);
+  unsigned InputVectorSizeValue = 0;
+  if (IsHLSLVecType(InputVectorExpr->getType())) {
+    InputVectorSizeValue = GetHLSLVecSize(InputVectorExpr->getType());
+    QualType InputVectorType =
+        GetHLSLVecElementType(InputVectorExpr->getType());
+    unsigned BitWidth = S.Context.getTypeSize(InputVectorType);
+    bool Is32Bit = (BitWidth == 32);
+    const Type *InputVectorTypePtr = InputVectorType.getTypePtr();
+
+    // Check the element type against the isUnsigned flag setting
+    if (IsInputVectorPacked) {
+      // Check that the input vector element type is "32bit"
+      if (!Is32Bit) {
+        S.Diags.Report(
+            InputVectorExpr->getExprLoc(),
+            diag::err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint);
+        return;
+      }
+
+      // Check that the input vector element type is an unsigned int
+      if (!InputVectorTypePtr->isUnsignedIntegerType()) {
+        S.Diags.Report(
+            InputVectorExpr->getExprLoc(),
+            diag::err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint);
+        return;
+      }
+
+      // Check that isInputUnsigned is always true
+      // Actual signedness is inferred from the InputInterpretation
+      if (!IsInputUnsignedFlagValue) {
+        S.Diags.Report(
+            IsInputUnsignedExpr->getExprLoc(),
+            diag::
+                err_hlsl_linalg_mul_muladd_isUnsigned_for_packed_input_must_be_true);
+        return;
+      }
+    } else {
+      if (IsInputUnsignedFlagValue &&
+          !InputVectorTypePtr->isUnsignedIntegerType()) {
+        DXASSERT_NOMSG(InputVectorTypePtr->isSignedIntegerType() ||
+                       InputVectorTypePtr->isFloatingType());
+        S.Diags.Report(
+            IsInputUnsignedExpr->getExprLoc(),
+            diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type)
+            << "IsInputUnsigned" << false
+            << (InputVectorTypePtr->isSignedIntegerType() ? 1 : 0);
+        return;
+      } else if (!IsInputUnsignedFlagValue &&
+                 InputVectorTypePtr->isUnsignedIntegerType()) {
+        S.Diags.Report(
+            IsInputUnsignedExpr->getExprLoc(),
+            diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type)
+            << "IsInputUnsigned" << true << 2;
+        return;
+      }
+    }
+  }
+
+  // Get Matrix Dimensions M and K, check if they are constants
+  Expr *MatrixKExpr = CE->getArg(kMatVecMulMatrixKIdx);
+  llvm::APSInt MatrixKExprVal;
+  unsigned MatrixKValue = 0;
+  if (MatrixKExpr->isIntegerConstantExpr(MatrixKExprVal, S.Context)) {
+    MatrixKValue = MatrixKExprVal.getLimitedValue();
+  } else {
+    S.Diags.Report(MatrixKExpr->getExprLoc(), diag::err_expr_not_ice) << 0;
+    return;
+  }
+
+  Expr *MatrixMExpr = CE->getArg(kMatVecMulMatrixMIdx);
+  llvm::APSInt MatrixMExprVal;
+  unsigned MatrixMValue = 0;
+  if (MatrixMExpr->isIntegerConstantExpr(MatrixMExprVal, S.Context)) {
+    MatrixMValue = MatrixMExprVal.getLimitedValue();
+  } else {
+    S.Diags.Report(MatrixMExpr->getExprLoc(), diag::err_expr_not_ice) << 0;
+    return;
+  }
+
+  // Check MatrixM and MatrixK values are non-zero
+  if (MatrixMValue == 0) {
+    S.Diags.Report(MatrixMExpr->getExprLoc(),
+                   diag::err_hlsl_linalg_matrix_dim_must_be_greater_than_zero)
+        << std::to_string(DXIL::kSM69MaxVectorLength);
+    return;
+  }
+
+  if (MatrixKValue == 0) {
+    S.Diags.Report(MatrixKExpr->getExprLoc(),
+                   diag::err_hlsl_linalg_matrix_dim_must_be_greater_than_zero)
+        << std::to_string(DXIL::kSM69MaxVectorLength);
+    return;
+  }
+
+  // Check MatrixM and MatrixK values do not exceed the max
+  // Matrix dimension cannot exceed largest vector length in a Mul/MulAdd
+  // operation.
+  if (MatrixMValue > DXIL::kSM69MaxVectorLength) {
+    S.Diags.Report(MatrixMExpr->getExprLoc(),
+                   diag::err_hlsl_linalg_mul_muladd_invalid_dim)
+        << 0 << std::to_string(DXIL::kSM69MaxVectorLength);
+    return;
+  }
+
+  // For packed input vectors 4 values are packed in a uint, so max Matrix K
+  // is 4 * DXIL::kSM69MaxVectorLength
+  if (IsInputVectorPacked) {
+    const unsigned PackingFactor =
+        4; // Only supported packed formats: DATA_TYPE_(U)SINT8_T4_PACKED
+    if (MatrixKValue > DXIL::kSM69MaxVectorLength * PackingFactor) {
+      S.Diags.Report(MatrixKExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_mul_muladd_invalid_dim)
+          << 2 << std::to_string(DXIL::kSM69MaxVectorLength * PackingFactor);
+      return;
+    }
+  } else {
+    if (MatrixKValue > DXIL::kSM69MaxVectorLength) {
+      S.Diags.Report(MatrixKExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_mul_muladd_invalid_dim)
+          << 1 << std::to_string(DXIL::kSM69MaxVectorLength);
+      return;
+    }
+  }
+
+  if (!IsValidVectorAndMatrixDimensions(S, CE, InputVectorSizeValue,
+                                        OutputVectorSizeValue, MatrixKValue,
+                                        MatrixMValue, IsInputVectorPacked)) {
+    return;
+  }
+
+  // Get MatrixInterpretation, check if it is constant
+  // Make sure it is a valid value
+  Expr *MatrixInterpretationExpr =
+      CE->getArg(kMatVecMulMatrixInterpretationIdx);
+  llvm::APSInt MatrixInterpretationExprVal;
+  unsigned MatrixInterpretationValue = 0;
+  if (MatrixInterpretationExpr->isIntegerConstantExpr(
+          MatrixInterpretationExprVal, S.Context)) {
+    MatrixInterpretationValue = MatrixInterpretationExprVal.getLimitedValue();
+    const bool InRegisterInterpretation = false;
+    if (!IsValidLinalgTypeInterpretation(MatrixInterpretationValue,
+                                         InRegisterInterpretation)) {
+      S.Diags.Report(MatrixInterpretationExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_interpretation_value_incorrect)
+          << std::to_string(MatrixInterpretationValue)
+          << InRegisterInterpretation;
+      return;
+    }
+  } else {
+    S.Diags.Report(MatrixInterpretationExpr->getExprLoc(),
+                   diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  // Get MatrixLayout, check if it is constant and valid value
+  Expr *MatrixLayoutExpr = CE->getArg(kMatVecMulMatrixLayoutIdx);
+  llvm::APSInt MatrixLayoutExprVal;
+  unsigned MatrixLayoutValue = 0;
+  if (MatrixLayoutExpr->isIntegerConstantExpr(MatrixLayoutExprVal, S.Context)) {
+    MatrixLayoutValue = MatrixLayoutExprVal.getLimitedValue();
+    if (!IsValidMatrixLayoutForMulAndMulAddOps(MatrixLayoutValue)) {
+      S.Diags.Report(MatrixLayoutExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_matrix_layout_invalid)
+          << std::to_string(MatrixLayoutValue)
+          << std::to_string(
+                 static_cast<unsigned>(DXIL::LinalgMatrixLayout::RowMajor))
+          << std::to_string(static_cast<unsigned>(
+                 DXIL::LinalgMatrixLayout::OuterProductOptimal));
+      return;
+    }
+  } else {
+    S.Diags.Report(MatrixLayoutExpr->getExprLoc(), diag::err_expr_not_ice) << 0;
+    return;
+  }
+
+  // Get MatrixTranspose, check if it is constant
+  Expr *MatrixTransposeExpr = CE->getArg(kMatVecMulMatrixTransposeIdx);
+  llvm::APSInt MatrixTransposeExprVal;
+  unsigned MatrixTransposeValue = 0;
+  if (MatrixTransposeExpr->isIntegerConstantExpr(MatrixTransposeExprVal,
+                                                 S.Context)) {
+    MatrixTransposeValue = MatrixTransposeExprVal.getBoolValue();
+    if (!IsValidTransposeForMatrixLayout(MatrixLayoutValue,
+                                         MatrixTransposeValue)) {
+
+      S.Diags.Report(MatrixTransposeExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_matrix_layout_is_not_transposable);
+      return;
+    }
+  } else {
+    S.Diags.Report(MatrixTransposeExpr->getExprLoc(), diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  // Get MatrixStride; when it is constant it must be zero for optimal
+  // layouts. A non-constant stride is not diagnosed here.
+  Expr *MatrixStrideExpr = CE->getArg(kMatVecMulMatrixStrideIdx);
+  llvm::APSInt MatrixStrideExprVal;
+  unsigned MatrixStrideValue = 0;
+  if (MatrixStrideExpr->isIntegerConstantExpr(MatrixStrideExprVal, S.Context)) {
+    MatrixStrideValue = MatrixStrideExprVal.getLimitedValue();
+    if (IsOptimalTypeMatrixLayout(MatrixLayoutValue) &&
+        MatrixStrideValue != 0) {
+      S.Diags.Report(
+          MatrixStrideExpr->getExprLoc(),
+          diag::
+              err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero);
+      return;
+    }
+  }
+}
+
+static void CheckMulCall(Sema &S, FunctionDecl *FD, CallExpr *CE,
+                         const hlsl::ShaderModel *SM) {
+  CheckCommonMulAndMulAddParameters(S, CE, SM); // Mul has no extra checks.
+}
+
+static void CheckMulAddCall(Sema &S, FunctionDecl *FD, CallExpr *CE,
+                            const hlsl::ShaderModel *SM) {
+  // Parameters shared with Mul are validated first; the bias check below
+  // runs even when that validation has already reported an error.
+  CheckCommonMulAndMulAddParameters(S, CE, SM);
+
+  // BiasInterpretation has to be an integer constant expression.
+  Expr *BiasArg = CE->getArg(kMatVecMulAddBiasInterpretation);
+  llvm::APSInt BiasArgVal;
+  if (!BiasArg->isIntegerConstantExpr(BiasArgVal, S.Context)) {
+    S.Diags.Report(BiasArg->getExprLoc(), diag::err_expr_not_ice) << 0;
+    return;
+  }
+
+  // It is validated as a non-register interpretation, which rules out the
+  // packed component types.
+  const bool InRegisterInterpretation = false;
+  const unsigned BiasInterpretation = BiasArgVal.getLimitedValue();
+  if (IsValidLinalgTypeInterpretation(BiasInterpretation,
+                                      InRegisterInterpretation))
+    return;
+
+  S.Diags.Report(BiasArg->getExprLoc(),
+                 diag::err_hlsl_linalg_interpretation_value_incorrect)
+      << std::to_string(BiasInterpretation) << InRegisterInterpretation;
+}
+
+// Linalg OuterProductAccumulate
+// Argument indices of the OuterProductAccumulate builtin call.
+static const unsigned kOuterProdAccInputVector1Idx = 0;
+static const unsigned kOuterProdAccInputVector2Idx = 1;
+// static const unsigned kOuterProdAccMatrixBufferIdx = 2;
+// static const unsigned kOuterProdAccMatrixOffsetIdx = 3;
+static const unsigned kOuterProdAccMatrixInterpretationIdx = 4;
+static const unsigned kOuterProdAccMatrixLayoutIdx = 5;
+static const unsigned kOuterProdAccMatrixStrideIdx = 6;
+
+// Validates an OuterProductAccumulate builtin call; reports the first error.
+static void CheckOuterProductAccumulateCall(Sema &S, FunctionDecl *FD,
+                                            CallExpr *CE) {
+  // InputVector1 and InputVector2 must have the same element type
+  const Expr *InputVector1Expr = CE->getArg(kOuterProdAccInputVector1Idx);
+  const Expr *InputVector2Expr = CE->getArg(kOuterProdAccInputVector2Idx);
+  QualType InputVector1Type = InputVector1Expr->getType();
+  QualType InputVector2Type = InputVector2Expr->getType();
+
+  const QualType InputVector1ElementType =
+      GetHLSLVecElementType(InputVector1Type);
+  const QualType InputVector2ElementType =
+      GetHLSLVecElementType(InputVector2Type);
+
+  if (!S.Context.hasSameType(InputVector1ElementType,
+                             InputVector2ElementType)) {
+    S.Diags.Report(InputVector2Expr->getExprLoc(),
+                   diag::err_hlsl_linalg_outer_prod_acc_vector_type_mismatch);
+    return;
+  }
+
+  // Check Matrix Interpretation is a constant and a valid value
+  Expr *MatrixInterpretationExpr =
+      CE->getArg(kOuterProdAccMatrixInterpretationIdx);
+  llvm::APSInt MatrixInterpretationExprVal;
+  unsigned MatrixInterpretationValue = 0;
+  if (MatrixInterpretationExpr->isIntegerConstantExpr(
+          MatrixInterpretationExprVal, S.Context)) {
+    MatrixInterpretationValue = MatrixInterpretationExprVal.getLimitedValue();
+    const bool InRegisterInterpretation = false;
+    if (!IsValidLinalgTypeInterpretation(MatrixInterpretationValue,
+                                         InRegisterInterpretation)) {
+      S.Diags.Report(MatrixInterpretationExpr->getExprLoc(),
+                     diag::err_hlsl_linalg_interpretation_value_incorrect)
+          << std::to_string(MatrixInterpretationValue)
+          << InRegisterInterpretation;
+      return;
+    }
+  } else {
+    S.Diags.Report(MatrixInterpretationExpr->getExprLoc(),
+                   diag::err_expr_not_ice)
+        << 0;
+    return;
+  }
+
+  // Check Matrix Layout must be a constant and OuterProductOptimal
+  Expr *MatrixLayoutExpr = CE->getArg(kOuterProdAccMatrixLayoutIdx);
+  llvm::APSInt MatrixLayoutExprVal;
+  unsigned MatrixLayoutValue = 0;
+  if (MatrixLayoutExpr->isIntegerConstantExpr(MatrixLayoutExprVal, S.Context)) {
+    MatrixLayoutValue = MatrixLayoutExprVal.getLimitedValue();
+    if (MatrixLayoutValue !=
+        static_cast<unsigned>(DXIL::LinalgMatrixLayout::OuterProductOptimal)) {
+      S.Diags.Report(
+          MatrixLayoutExpr->getExprLoc(),
+          diag::
+              err_hlsl_linalg_outer_prod_acc_matrix_layout_must_be_outer_prod_acc_optimal)
+          << std::to_string(static_cast<unsigned>(
+                 DXIL::LinalgMatrixLayout::OuterProductOptimal));
+      return;
+    }
+  } else {
+    S.Diags.Report(MatrixLayoutExpr->getExprLoc(), diag::err_expr_not_ice) << 0;
+    return;
+  }
+
+  // A constant Matrix Stride must be zero (OuterProductOptimal matrix layout)
+  Expr *MatrixStrideExpr = CE->getArg(kOuterProdAccMatrixStrideIdx);
+  llvm::APSInt MatrixStrideExprVal;
+  unsigned MatrixStrideValue = 0;
+  if (MatrixStrideExpr->isIntegerConstantExpr(MatrixStrideExprVal, S.Context)) {
+    MatrixStrideValue = MatrixStrideExprVal.getLimitedValue();
+    if (MatrixStrideValue != 0) {
+      S.Diags.Report(
+          MatrixStrideExpr->getExprLoc(),
+          diag::
+              err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero);
+      return;
+    }
+  }
+}
+
 #ifdef ENABLE_SPIRV_CODEGEN
 static bool CheckVKBufferPointerCast(Sema &S, FunctionDecl *FD, CallExpr *CE,
                                      bool isStatic) {
@@ -11721,6 +12262,15 @@ void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
   case hlsl::IntrinsicOp::IOP_Barrier:
     CheckBarrierCall(*this, FDecl, TheCall, SM);
     break;
+  case hlsl::IntrinsicOp::IOP___builtin_MatVecMul:
+    CheckMulCall(*this, FDecl, TheCall, SM);
+    break;
+  case hlsl::IntrinsicOp::IOP___builtin_MatVecMulAdd:
+    CheckMulAddCall(*this, FDecl, TheCall, SM);
+    break;
+  case hlsl::IntrinsicOp::IOP___builtin_OuterProductAccumulate:
+    CheckOuterProductAccumulateCall(*this, FDecl, TheCall);
+    break;
 #ifdef ENABLE_SPIRV_CODEGEN
   case hlsl::IntrinsicOp::IOP_Vkreinterpret_pointer_cast:
     CheckVKBufferPointerCast(*this, FDecl, TheCall, false);
@@ -12119,18 +12669,6 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
     break;
   case hlsl::IntrinsicOp::IOP_DxMaybeReorderThread:
     DiagnoseReachableSERCall(*this, CE, EntrySK, EntryDecl, true);
-    break;
-  case hlsl::IntrinsicOp::IOP___builtin_MatVecMul:
-  case hlsl::IntrinsicOp::IOP___builtin_MatVecMulAdd:
-  case hlsl::IntrinsicOp::IOP___builtin_OuterProductAccumulate:
-  case hlsl::IntrinsicOp::IOP___builtin_VectorAccumulate:
-    if (!SM->IsSM69Plus()) {
-      Diags.Report(CE->getExprLoc(),
-                   diag::warn_hlsl_intrinsic_in_wrong_shader_model)
-          << FD->getNameAsString() << EntryDecl->getNameAsString() << "6.9";
-      return;
-    }
-
     break;
   default:
     break;
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
index 98a568fa22..de811982d6 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl
@@ -1,43 +1,57 @@
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-0
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-1
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-2
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-3
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-4
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-5
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-6
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-7
-
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-7
+
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
 
 
 // COMMON: define void @main()
 
 // Test minimum support set of combinations for matVecMul
-// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
-// DXIL-0: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
-// DXIL-1: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
-// DXIL-2: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
-// DXIL-3: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
-// DXIL-4: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+
+// DXIL-0: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+
+// DXIL-1: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-2: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8)
+
+// DXIL-2: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-3: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <2 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <2 x i32> %{{[^ ]+}}, i1 true, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
+
+// DXIL-3: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v2i32(i32 306, <2 x i32> {{[^ ]+}}, i1 true, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-4: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4)
+
+// DXIL-4: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
 
 // Test unsigned variations
-// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
-// DXIL-5: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 true)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
-// DXIL-6: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
-// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
-// DXIL-7: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+// HLOP-5: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+
+// DXIL-5: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 true)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-6: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+
+// DXIL-6: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
+
+// HLOP-7: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
+
+// DXIL-7: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false)  ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
 
 
 ByteAddressBuffer input_vector_buffer; 
@@ -84,10 +98,10 @@ enum MatLayout {
 [NumThreads(1,1,1)]
 void main()
 {    
-    vector<OTY, 4> output_vector;
+    vector<OTY, 8> output_vector;
     static const uint is_output_unsigned = OU;
     
-    vector<ITY, 8> input_vector = input_vector_buffer.Load<vector<ITY, 8> >(0);
+    vector<ITY, INUM> input_vector = input_vector_buffer.Load<vector<ITY, INUM> >(0);
     const uint is_input_unsigned = IU;
     const uint input_interpretation = II;
     
@@ -97,7 +111,7 @@ void main()
     const uint matrix_dimK = 8;
     const uint matrix_layout = ML;
     const bool matrix_is_transposed = (bool) MT; 
-    const uint matrix_stride = 64;
+    const uint matrix_stride = MST;
 
     const uint bias_offset = 0;
     const uint bias_interpretation = BI;
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
index 2ca2648503..8b14fb4cf1 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl
@@ -1,42 +1,56 @@
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-0
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-1
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-2
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-3
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-4
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-5
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-6
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-7
-
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
-// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=PackedS8x32 -DINUM=2 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-7
+
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6
+// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7
 
 // COMMON: define void @main()
 
 // Test minimum support set of combinations for matVecMul
-// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
-// DXIL-0: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64)
-// DXIL-1: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64)
-// DXIL-2: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64)
-// DXIL-3: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64)
-// DXIL-4: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
+
+// DXIL-0: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0)
+
+// DXIL-1: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-2: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0)
+
+// DXIL-2: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-3: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <2 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <2 x i32> %{{[^ ]+}}, i1 true, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0)
+
+// DXIL-3: call <8 x i32>  @dx.op.matVecMul.v8i32.v2i32(i32 305, <2 x i32> {{[^ ]+}}, i1 true, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-4: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64)
+
+// DXIL-4: call <8 x i32> @dx.op.matVecMul.v8i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
 
 // Test unsigned variations
-// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
-// DXIL-5: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 true)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64)
-// DXIL-6: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
-// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64)
-// DXIL-7: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+// HLOP-5: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64)
+
+// DXIL-5: call <8 x i32> @dx.op.matVecMul.v8i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 true)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-6: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64)
+
+// DXIL-6: call <8 x i32> @dx.op.matVecMul.v8i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
+
+// HLOP-7: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0)
+
+// DXIL-7: call <8 x i32> @dx.op.matVecMul.v8i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, i1 false)  ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned)
 
 
 ByteAddressBuffer input_vector_buffer; 
@@ -83,10 +97,10 @@ enum MatLayout {
 [NumThreads(1,1,1)]
 void main()
 {    
-    vector<OTY, 4> output_vector;
+    vector<OTY, 8> output_vector;
     static const uint is_output_unsigned = OU;
     
-    vector<ITY, 8> input_vector = input_vector_buffer.Load<vector<ITY, 8> >(0);
+    vector<ITY, INUM> input_vector = input_vector_buffer.Load<vector<ITY, INUM> >(0);
     const uint is_input_unsigned = IU;
     const uint input_interpretation = II;
     
@@ -96,7 +110,7 @@ void main()
     const uint matrix_dimK = 8;
     const uint matrix_layout = ML;
     const bool matrix_is_transposed = (bool) MT; 
-    const uint matrix_stride = 64;
+    const uint matrix_stride = MST;
 
     __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, 
         matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride);
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
index c40365078f..c53b7d8f21 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl
@@ -1,7 +1,6 @@
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-0
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2
-
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1
 // RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2
@@ -11,11 +10,17 @@ ByteAddressBuffer input_vector_buffer2;
 RWByteAddressBuffer matrix_buffer;
 
 // COMMON: define void @main()
+
 // DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+
 // HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0)
+
 // DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+
 // HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0)
+
 // DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0)  ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride)
+
 // HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0)
 
 enum CompType {
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
index 141801c71c..26bcc75da2 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl
@@ -15,26 +15,78 @@ export float4 Test1(vector<float, 4> Input) {
       Matrix, MakeInterpretedVector<DATA_TYPE_FLOAT16>(Input));
 }
 
-export vector<float, 8> Test2(vector<uint8_t4_packed, 6> Input) {
+export vector<float, 8> Test2(vector<uint, 6> Input) {
   using namespace dx::linalg;
 
   MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_MUL_OPTIMAL> Matrix = {
       Buf, 0, 0};
 
   // note the stride argument is dropped.
-  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6f32(i32 305, <6 x float> %{{.+}}, i1 false, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 2, i1 false, i32 0, i1 false)
+  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 2, i1 false, i32 0, i1 false)
   return Mul<float>(Matrix,
                     MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
 }
 
 // test that "stride" isn't ignored in non-optimal layouts
-export vector<float, 8> Test3(vector<uint8_t4_packed, 6> Input) {
+export vector<float, 8> Test3(vector<uint, 6> Input) {
   using namespace dx::linalg;
 
   MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
       Buf, 0, 6 * 4 * 8};
 
-  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6f32(i32 305, <6 x float> %{{.+}}, i1 false, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 false)
+  // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 false)
   return Mul<float>(Matrix,
                     MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
 }
+
+// test that isUnsigned is set correctly for uint16_t
+export vector<uint16_t, 8> Test4(vector<uint, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
+      Buf, 0, 6 * 4 * 8};
+
+  // CHECK: %{{.+}} = call <8 x i16> @dx.op.matVecMul.v8i16.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true)
+  return Mul<uint16_t>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));  
+
+}
+
+// test that isUnsigned is set correctly for uint32_t
+export vector<uint, 8> Test5(vector<uint, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
+      Buf, 0, 6 * 4 * 8};
+
+  // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true)
+  return Mul<uint>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));  
+
+}
+
+// test that isUnsigned is set correctly for a uint8_t4_packed input vector
+export vector<uint, 8> Test6(vector<uint8_t4_packed, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
+      Buf, 0, 6 * 4 * 8};
+
+  // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true)
+  return Mul<uint>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_UINT8_T4_PACKED>(Input));
+
+}
+
+// test that isUnsigned is set correctly for an int8_t4_packed input vector
+export vector<uint, 8> Test7(vector<int8_t4_packed, 6> Input) {
+  using namespace dx::linalg;
+
+  MatrixRef<DATA_TYPE_UINT8, 8, 6 * 4, MATRIX_LAYOUT_ROW_MAJOR> Matrix = {
+      Buf, 0, 6 * 4 * 8};
+
+  // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 17, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true)
+  return Mul<uint>(Matrix,
+                    MakeInterpretedVector<DATA_TYPE_SINT8_T4_PACKED>(Input));
+
+}
\ No newline at end of file
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl
new file mode 100644
index 0000000000..866fad8225
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl
@@ -0,0 +1,1398 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer;
+RWByteAddressBuffer output_vector_buffer;
+ByteAddressBuffer constants_buffer;
+
+// Output vector element type does not match IsOutputUnsigned
+void test_invalid_output_vector_type() {
+
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  vector<uint, 4> output_vector_0;
+  const uint is_output_unsigned_0 = 0;
+
+  // expected-error@+1 {{IsOuputUnsigned must be true for vector of unsigned integer type}}
+  __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned_0, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<int32_t, 4> output_vector_1;
+  const uint is_output_unsigned_1 = 1;
+
+  // expected-error@+1 {{IsOuputUnsigned must be false for vector of signed integer type}}
+  __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned_1, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<float, 4> output_vector_2;
+  const uint is_output_unsigned_2 = 1;
+
+  // expected-error@+1 {{IsOuputUnsigned must be false for vector of floating point type}}
+  __builtin_MatVecMulAdd(output_vector_2, is_output_unsigned_2, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// IsOutputUnsigned is not a constant parameter
+void test_invalid_is_output_unsigned_non_const() {
+
+  vector<uint, 4> output_vector_0;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint is_output_unsigned_0 = constants_buffer.Load<uint>(0);
+
+  // expected-error@+1 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned_0, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Input vector has an unsupported element type (64-bit types)
+void test_invalid_input_vector_type() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+    vector<int64_t, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<int64_t, 4> >(0);
+    const uint is_input_unsigned_0 = 0;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<int64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned_0, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+    vector<uint64_t, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint64_t, 4> >(0);
+    const uint is_input_unsigned_1 = 1;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<uint64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned_1, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+    vector<float64_t, 4> input_vector_2 =
+      input_vector_buffer.Load<vector<float64_t, 4> >(0);
+    const uint is_input_unsigned_2 = 0;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<float64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2,
+                        is_input_unsigned_2, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Input vector is incorrect type for packed InputInterpretation
+void test_invalid_input_vector_type_packed_input_interpretation() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint input_interpretation_0 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<int16_t, 2> input_vector_0 =
+      input_vector_buffer.Load<vector<int16_t, 2> >(0);
+  const uint is_input_unsigned_0 = 1;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  vector<uint16_t, 2> input_vector_1 =
+      input_vector_buffer.Load<vector<uint16_t, 2> >(0);
+  const uint is_input_unsigned_1 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} 
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_2 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  vector<int32_t, 1> input_vector_2 =
+      input_vector_buffer.Load<vector<int32_t, 1> >(0);
+  const uint is_input_unsigned_2 = 1;
+  
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2,
+                        is_input_unsigned_2, input_interpretation_2, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_3 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<int32_t, 1> input_vector_3 =
+      input_vector_buffer.Load<vector<int32_t, 1> >(0);
+  const uint is_input_unsigned_3 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_3,
+                        is_input_unsigned_3, input_interpretation_3, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_4 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<float, 1> input_vector_4 =
+      input_vector_buffer.Load<vector<float, 1> >(0);
+  const uint is_input_unsigned_4 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_4, 
+                        is_input_unsigned_4, input_interpretation_4, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// IsInputUnsigned must be true for packed input interpretations
+void test_invalid_is_input_unsigned_packed_input_vector_type() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;  
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;  
+  vector<uint, 1> input_vector_0 = 
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_0 = 0;
+
+  // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,  
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_1 = 0;
+  
+  // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Packed input vector length must be the smallest count that holds matrix_dimK packed values
+void test_invalid_packed_input_vector_dimension() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 0;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_UINT32;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint matrix_dimK_0 = 4;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint matrix_dimK_1 = 7;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<uint, 3> input_vector_2 =
+      input_vector_buffer.Load<vector<uint, 3> >(0);
+  const uint matrix_dimK_2 = 7;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_2, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+}
+
+// Check that the input vector type and IsInputUnsigned match
+void test_invalid_input_vector_type_mismatch() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);    
+  const uint is_input_unsigned_0 = 0;
+
+  // expected-error@+2 {{IsInputUnsigned must be true for vector of unsigned integer type}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned_0, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<int32_t, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<int32_t, 4> >(0);
+  const uint is_input_unsigned_1 = 1;
+
+  // expected-error@+2 {{IsInputUnsigned must be false for vector of signed integer type}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned_1, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<float16_t, 4> input_vector_2 =
+      input_vector_buffer.Load<vector<float16_t, 4> >(0);
+  const uint is_input_unsigned_2 = 1;
+
+  // expected-error@+2 {{IsInputUnsigned must be false for vector of floating point type}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2,
+                        is_input_unsigned_2, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the Matrix M dimension must be an integer constant expression
+void test_invalid_matrix_M_dimension() {
+  // Every argument is a literal-initialized constant except matrix_dimM, which is loaded from constants_buffer.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64; 
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint matrix_dimM = constants_buffer.Load<uint>(0);   
+  // The +3 offset below lands on the call line that carries matrix_dimM.
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the Matrix K dimension must be an integer constant expression
+void test_invalid_matrix_K_dimension() {
+  // Every argument is a literal-initialized constant except matrix_dimK, which is loaded from constants_buffer.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0; 
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint matrix_dimK = constants_buffer.Load<uint>(0);
+  // The +4 offset below lands on the call line that carries matrix_dimK.
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that a Matrix M dimension of zero is rejected
+void test_invalid_matrix_M_dimension_non_zero() {
+  // matrix_dimM is the constant 0 here; the +3 offset below lands on the call line that carries it.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint matrix_dimM = 0;
+  // expected-error@+3 {{matrix dimension must be greater than 0}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that a Matrix K dimension of zero is rejected
+void test_invalid_matrix_K_dimension_non_zero() {
+  // matrix_dimK is the constant 0 here; the +4 offset below lands on the call line that carries it.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint matrix_dimK = 0;
+  // expected-error@+4 {{matrix dimension must be greater than 0}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that a Matrix M dimension above the maximum is rejected
+void test_invalid_matrix_M_dimension_less_than_Max() {
+  // Arguments shared by both calls; each case supplies its own input vector, input interpretation and M.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = matrix_dimK * 4;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Case 0: M = 1025 with a DATA_TYPE_FLOAT32 input interpretation.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM_0 = 1025;
+
+  // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM_0,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+  // Case 1: M = 4097 with a DATA_TYPE_UINT8_T4_PACKED input interpretation and a 1-element input vector.
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimM_1 = 4097;
+
+  // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM_1,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that a Matrix K dimension above the unpacked-input maximum is rejected
+void test_invalid_matrix_K_dimension_less_than_Max_unpacked_input_vector() {
+  // Arguments shared by both calls; each case supplies its own input vector, input interpretation and K.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Case 0: K = 1025 with a DATA_TYPE_FLOAT32 input interpretation.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK_0 = 1025;
+
+  // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+  // Case 1: K = 4096 with a DATA_TYPE_UINT8 input interpretation.
+  vector<uint, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimK_1 = 4096;
+  // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, 
+                        is_input_unsigned, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+}
+
+// Check that a Matrix K dimension above the packed-input maximum is rejected
+void test_invalid_matrix_M_dimension_less_than_Max_packed_input_vector() {
+  // NOTE(review): despite the "M" in the function name, the dimension under test is K (matrix_dimK_0 = 4097).
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 1024;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 4096;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // K = 4097 with a DATA_TYPE_UINT8_T4_PACKED input interpretation and a 1024-element uint input vector.
+  vector<uint, 1024> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1024> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimK_0 = 4097;
+
+  // expected-error@+4 {{matrix dimension K when using packed input vectors must be less than 4096, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+void test_invalid_input_interpretation_non_const() {
+  // Check that InputInterpretation must be an integer constant expression.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // input_interpretation is loaded from constants_buffer instead of being a literal constant.
+  const uint input_interpretation = constants_buffer.Load<uint>(0);
+
+  // expected-error@+2 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that unsupported InputInterpretation values are rejected
+void test_invalid_input_interpretation_value() {
+  // Each call below passes one rejected value: 0, 1, 6, 7, 10 through 16, 23 and 100.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);   
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // In every case the +2 offset lands on the call line that carries the input interpretation argument.
+  const uint input_interpretation_0 = 0;
+
+  // expected-error@+2 {{0 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_1 = 1;
+
+  // expected-error@+2 {{1 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_1, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_2 = 6;
+
+  // expected-error@+2 {{6 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_2, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_3 = 7;
+
+  // expected-error@+2 {{7 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_3, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);   
+
+  const uint input_interpretation_4 = 10;
+
+  // expected-error@+2 {{10 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_4, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,    
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_5 = 11;       
+
+  // expected-error@+2 {{11 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_5, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_6 = 12;
+
+  // expected-error@+2 {{12 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_6, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_7 = 13;
+
+  // expected-error@+2 {{13 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_7, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_8 = 14;
+
+  // expected-error@+2 {{14 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_8, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_9 = 15;
+
+  // expected-error@+2 {{15 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_9, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_10 = 16;
+
+  // expected-error@+2 {{16 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_10, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_11 = 23;
+
+  // expected-error@+2 {{23 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_11, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_12 = 100;
+
+  // expected-error@+2 {{100 is an invalid register interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation_12, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+// Check that input and output vector lengths must match the matrix dimensions (unpacked input, M = K = 32)
+void test_invalid_input_output_vector_dimensions_non_packed_square_matrix() {
+  // Arguments shared by both calls; each case declares its own output and input vectors.
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 32;
+  const uint matrix_dimK = 32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Case 0: output length 32 matches M, input length 30 does not match K.
+  vector<uint, 32> output_vector_0;
+  vector<float, 30> input_vector_0 =   
+      input_vector_buffer.Load<vector<float, 30> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned, input_vector_0,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+  // Case 1: input length 32 matches K, output length 30 does not match M.
+  vector<uint, 30> output_vector_1;
+  vector<float, 32> input_vector_1 =   
+      input_vector_buffer.Load<vector<float, 32> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned, input_vector_1,    
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that input and output vector lengths must match the matrix dimensions (unpacked input, M = 16, K = 32)
+void test_invalid_input_output_vector_dimensions_non_packed_rectangle_matrix() {
+  // Arguments shared by all calls; each case declares its own output and input vectors.
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 16;
+  const uint matrix_dimK = 32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  // Output length uses the Matrix K dimension (32) instead of M (16)
+  vector<uint, 32> output_vector_0;
+  vector<float, 32> input_vector_0 =   
+      input_vector_buffer.Load<vector<float, 32> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned, input_vector_0,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+ 
+ // Off-by-one check: output length is M + 1 (17)
+  vector<uint, 17> output_vector_1;
+  vector<float, 16> input_vector_1 =   
+      input_vector_buffer.Load<vector<float, 16> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned, input_vector_1,    
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+ // Off-by-one check: output length is M - 1 (15)
+ vector<uint, 15> output_vector_2;
+ vector<float, 16> input_vector_2 =   
+     input_vector_buffer.Load<vector<float, 16> >(0);
+
+ // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}         
+ __builtin_MatVecMulAdd(output_vector_2, is_output_unsigned, input_vector_2,    
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  // Input length uses the Matrix M dimension (16) instead of K (32)
+  vector<uint, 16> output_vector_3;
+  vector<float, 16> input_vector_3 =   
+      input_vector_buffer.Load<vector<float, 16> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMulAdd(output_vector_3, is_output_unsigned, input_vector_3,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  // Off-by-one check: input length is K - 1 (31)
+  vector<uint, 16> output_vector_4;
+  vector<float, 31> input_vector_4 =   
+      input_vector_buffer.Load<vector<float, 31> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}    
+  __builtin_MatVecMulAdd(output_vector_4, is_output_unsigned, input_vector_4,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  // Off-by-one check: input length is K + 1 (33)
+  vector<uint, 16> output_vector_5;
+  vector<float, 33> input_vector_5 =   
+      input_vector_buffer.Load<vector<float, 33> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}    
+  __builtin_MatVecMulAdd(output_vector_5, is_output_unsigned, input_vector_5,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+    // Swapped lengths: output uses K (32) and input uses M (16); only the output diagnostic is listed
+    vector<uint, 32> output_vector_6;
+    vector<float, 16> input_vector_6 =   
+        input_vector_buffer.Load<vector<float, 16> >(0);
+
+    // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}    
+    __builtin_MatVecMulAdd(output_vector_6, is_output_unsigned, input_vector_6,  
+                          is_input_unsigned, input_interpretation, matrix_buffer,
+                          matrix_offset, matrix_interpretation, matrix_dimM,
+                          matrix_dimK, matrix_layout, matrix_is_transposed,
+                          matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the matrix interpretation must be an integer constant expression
+void test_invalid_matrix_interpretation_constant_value() {
+  // The +3 offset below lands on the call line that carries the matrix interpretation argument.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // The call passes matrix_interpretation_0 (loaded from constants_buffer), not the constant matrix_interpretation above.
+  const uint matrix_interpretation_0 = constants_buffer.Load<uint>(0);
+
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_0, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check for invalid matrix interpretation value
+void test_invalid_matrix_interpretation_value() {
+  // Every call below passes a matrix interpretation value that is expected to be rejected.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Values exercised: 0, 1, 6, 7, 10-16, 23 and 100; each error is expected on the line carrying the value.
+  const uint matrix_interpretation_0 = 0;
+
+  // expected-error@+3 {{0 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_0, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_1 = 1;
+
+  // expected-error@+3 {{1 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_1, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_2 = 6;
+
+  // expected-error@+3 {{6 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_2, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_3 = 7;
+
+  // expected-error@+3 {{7 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation_3, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_4 = 10;
+
+  // expected-error@+3 {{10 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_4, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_5 = 11;
+
+  // expected-error@+3 {{11 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_5, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_6 = 12;
+
+  // expected-error@+3 {{12 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_6, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_7 = 13;
+
+  // expected-error@+3 {{13 is an invalid memory interpretation value}} 
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_7, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_8 = 14;
+
+  // expected-error@+3 {{14 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_8, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_9 = 15;
+
+  // expected-error@+3 {{15 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_9, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_10 = 16;
+
+  // expected-error@+3 {{16 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_10, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_11 = 23;
+  // expected-error@+3 {{23 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_11, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint matrix_interpretation_12 = 100;
+
+  // expected-error@+3 {{100 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation_12, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the matrix layout argument must be an integer constant expression
+void test_invalid_matrix_layout_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // matrix_layout is loaded from constants_buffer, so it is not a compile-time constant.
+  const uint matrix_layout = constants_buffer.Load<uint>(0);
+
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that a matrix layout value outside the range [0, 3] is rejected
+void test_invalid_matrix_layout_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // 4 is one past the upper bound named in the expected diagnostic.
+  const uint matrix_layout_0 = 4;
+
+  // expected-error@+4 {{matrix layout 4 is not valid, must be in the range [0, 3]}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout_0, matrix_is_transposed,
+                      matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the matrix-is-transposed argument must be a constant expression
+void test_invalid_matrix_transposed_constant_value() {
+  // matrix_is_transposed is loaded from constants_buffer below, so it is not a constant.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = constants_buffer.Load<bool>(0);
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the transpose flag is rejected for row-major and column-major layouts
+void test_invalid_matrix_transpose_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =   
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;   
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Case 1: row-major layout with the transpose flag set.
+  const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed_0 = true;
+
+  // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout_0, matrix_is_transposed_0,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+  // Case 2: column-major layout with the transpose flag set.
+  const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR;
+  const bool matrix_is_transposed_1 = true;
+
+  // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout_1, matrix_is_transposed_1,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+
+// Check that a non-zero matrix stride is rejected for the optimal matrix layouts
+void test_invalid_matrix_stride_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+  // Case 1: MUL_OPTIMAL layout with a stride of 64.
+  const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const uint matrix_stride_0 = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout_0, matrix_is_transposed,
+                        matrix_stride_0, bias_buffer, bias_offset, bias_interpretation);
+  // Case 2: OUTER_PRODUCT_OPTIMAL layout with a stride of 64.
+  const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride_1 = 64;
+  
+  // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                        is_input_unsigned, input_interpretation, matrix_buffer,   
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout_1, matrix_is_transposed,
+                        matrix_stride_1, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check that the bias interpretation argument must be an integer constant expression
+void test_invalid_bias_interpretation() {
+  vector<float, 4> output_vector;
+  const uint is_output_unsigned = 0;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const uint matrix_is_transposed = 0;
+  const uint matrix_stride = 0;
+  const uint bias_offset = 0;
+  // bias_interpretation_0 is loaded from constants_buffer, so it is not a compile-time constant.
+  const uint bias_interpretation_0 = constants_buffer.Load<uint>(0);
+
+  // expected-error@+6 {{expression is not an integer constant expression}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_0);
+}
+
+// Check that invalid bias interpretation values are rejected
+void test_invalid_bias_interpretation_value() {
+  vector<float, 4> output_vector;
+  const uint is_output_unsigned = 0;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4; 
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const uint matrix_is_transposed = 0;
+  const uint matrix_stride = 0;
+  const uint bias_offset = 0;
+  // Values exercised: 0, 1, 6, 7, 10-18, 23 and 100; each error is expected on the bias argument line.
+  const uint bias_interpretation_0 = 0;
+
+  // expected-error@+6 {{0 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_0);
+
+  const uint bias_interpretation_1 = 1;
+
+  // expected-error@+6 {{1 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_1);
+
+  const uint bias_interpretation_2 = 6;
+
+  // expected-error@+6 {{6 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_2);
+
+  const uint bias_interpretation_3 = 7;
+
+  // expected-error@+6 {{7 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_3);
+
+  const uint bias_interpretation_4 = 10;
+
+  // expected-error@+6 {{10 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_4);
+
+  const uint bias_interpretation_5 = 11;
+
+  // expected-error@+6 {{11 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_5);
+
+  const uint bias_interpretation_6 = 12;
+
+  // expected-error@+6 {{12 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,  
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_6);
+
+  const uint bias_interpretation_7 = 13;
+
+  // expected-error@+6 {{13 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_7);
+
+  const uint bias_interpretation_8 = 14;
+
+  // expected-error@+6 {{14 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_8);
+
+  const uint bias_interpretation_9 = 15;
+
+  // expected-error@+6 {{15 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_9);
+
+  const uint bias_interpretation_10 = 16;  
+  
+  // expected-error@+6 {{16 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_10);
+  // The two packed data types (reported as 17 and 18) are also expected to be rejected for the bias.
+  const uint bias_interpretation_11 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+
+  // expected-error@+6 {{17 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_11);
+
+  const uint bias_interpretation_12 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+
+  // expected-error@+6 {{18 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_12);
+
+  const uint bias_interpretation_13 = 23;
+
+  // expected-error@+6 {{23 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_13);
+
+  const uint bias_interpretation_14 = 100;
+
+  // expected-error@+6 {{100 is an invalid memory interpretation value}}
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
+                         is_input_unsigned, input_interpretation, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset,
+                         bias_interpretation_14);
+  }     
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl
new file mode 100644
index 0000000000..4b0bd6dd87
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl
@@ -0,0 +1,244 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+// Test resources: input vectors are loaded from input_vector_buffer; matrix_buffer and bias_buffer are passed to the builtin.
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer matrix_buffer;
+ByteAddressBuffer bias_buffer;
+RWByteAddressBuffer output_vector_buffer;
+ByteAddressBuffer constants_buffer;
+
+// Check valid input vector packed types: both packed 8-bit interpretations are
+// exercised with a one-element uint32_t input vector and K = 4.
+void test_valid_input_vector_packed_types() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  vector<uint32_t, 1> input_vector_0 =
+      input_vector_buffer.Load<vector<uint32_t, 1> >(0);
+  const uint is_input_unsigned_0 = 1;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                         is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<uint32_t, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint32_t, 1> >(0);
+  const uint is_input_unsigned_1 = 1;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                         is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                         matrix_offset, matrix_interpretation, matrix_dimM,
+                         matrix_dimK, matrix_layout, matrix_is_transposed,
+                         matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+}
+
+// IsInputUnsigned must be true for packed input vector type
+void test_valid_is_input_unsigned_packed_input_vector_type() {
+  // Both packed interpretations (UINT8_T4_PACKED, SINT8_T4_PACKED) are passed with is_input_unsigned = 1.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;  
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;  
+  vector<uint, 1> input_vector_0 = 
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_0 = 1;
+
+  // expected-no-diagnostics@+2
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,  
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_1 = 1;
+  
+  // expected-no-diagnostics@+2
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check packed input vector dimension
+void test_valid_packed_input_vector_dimension() {
+  // Packed UINT8 input: K = 4 with a 1-element vector, then K = 7 with a 2-element vector.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 0;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_UINT32;
+
+  vector<uint, 1> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint matrix_dimK_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<uint, 2> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 2> >(0);
+  const uint matrix_dimK_1 = 7;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned, input_interpretation, matrix_buffer, 
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check if Matrix M dimension is less than Max
+void test_valid_matrix_M_dimension_less_than_Max() {
+  // Both cases use M = 4: first an unpacked 4-element input, then a packed 1-element input.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = matrix_dimK * 4;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM_0,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimM_1 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1,
+                        is_input_unsigned, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM_1,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+// Check if Matrix K dimension is less than Max in unpacked input vector case
+void test_valid_matrix_K_dimension_less_than_Max_unpacked_input_vector() {
+  // Both cases use K = 4 with a 4-element input vector (FLOAT32, then UINT8 interpretation).
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+  vector<uint, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimK_1 = 4;
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, 
+                        is_input_unsigned, input_interpretation_1, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+
+}
+
+// Check if Matrix K dimension is less than Max in packed input vector case (the function name says M, but K is what varies)
+void test_valid_matrix_M_dimension_less_than_Max_packed_input_vector() {
+  // K = 4096 with a 1024-element packed input vector; M stays at 4.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+
+  vector<uint, 1024> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1024> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimK_0 = 4096;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0,
+                        is_input_unsigned, input_interpretation_0, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                        matrix_stride, bias_buffer, bias_offset, bias_interpretation);
+}
+
+
+
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl
new file mode 100644
index 0000000000..14f34d62c4
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl
@@ -0,0 +1,1156 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+
+ByteAddressBuffer input_vector_buffer;    // tests Load<> their input vectors from this buffer
+ByteAddressBuffer matrix_buffer;          // passed as the matrix resource argument of the builtin calls
+RWByteAddressBuffer output_vector_buffer; // NOTE(review): appears unused by the Mul tests - confirm
+ByteAddressBuffer constants_buffer;       // Load<uint> from here supplies non-constant argument values
+
+// Output vector, isUnsigned mismatch
+void test_invalid_output_vector_type() {
+  // Each call pairs an output vector element type with a contradicting IsOutputUnsigned flag.
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Case 0: uint output elements, flag set to 0.
+  vector<uint, 4> output_vector_0;
+  const uint is_output_unsigned_0 = 0;
+
+  // expected-error@+1 {{IsOuputUnsigned must be true for vector of unsigned integer type}}
+  __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: int32_t output elements, flag set to 1.
+  vector<int32_t, 4> output_vector_1;
+  const uint is_output_unsigned_1 = 1;
+
+  // expected-error@+1 {{IsOuputUnsigned must be false for vector of signed integer type}}
+  __builtin_MatVecMul(output_vector_1, is_output_unsigned_1, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 2: float output elements, flag set to 1.
+  vector<float, 4> output_vector_2;
+  const uint is_output_unsigned_2 = 1;
+
+  // expected-error@+1 {{IsOuputUnsigned must be false for vector of floating point type}}
+  __builtin_MatVecMul(output_vector_2, is_output_unsigned_2, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// IsOutputUnsigned is not a constant parameter
+void test_invalid_is_output_unsigned_non_const() {
+  // IsOutputUnsigned is initialized from a buffer load instead of a literal, which the builtin rejects.
+  vector<uint, 4> output_vector_0;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Declared const, but the initializer is a runtime load from constants_buffer.
+  const uint is_output_unsigned_0 = constants_buffer.Load<uint>(0);
+
+  // expected-error@+1 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Input vector is incorrect type - 64 bit types
+void test_invalid_input_vector_type() {
+  // Input vectors with 64-bit element types (int64_t, uint64_t, float64_t) match no overload of the builtin.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Case 0: int64_t elements.
+    vector<int64_t, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<int64_t, 4> >(0);
+    const uint is_input_unsigned_0 = 0;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<int64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: uint64_t elements.
+    vector<uint64_t, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint64_t, 4> >(0);
+    const uint is_input_unsigned_1 = 1;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<uint64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 2: float64_t elements.
+    vector<float64_t, 4> input_vector_2 =
+      input_vector_buffer.Load<vector<float64_t, 4> >(0);
+    const uint is_input_unsigned_2 = 0;
+
+// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}}
+// expected-note@+1 {{candidate function not viable: no known conversion from 'vector<float64_t, 4>' to 'vector<float, 4>' for 3rd argument}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2,
+                      is_input_unsigned_2, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Input vector is incorrect type for packed InputInterpretation
+void test_invalid_input_vector_type_packed_input_interpretation() {
+  // With a packed interpretation, any input element type other than a 32-bit unsigned int is rejected.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Case 0: int16_t elements with SINT8_T4_PACKED.
+  const uint input_interpretation_0 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<int16_t, 2> input_vector_0 =
+      input_vector_buffer.Load<vector<int16_t, 2> >(0);
+  const uint is_input_unsigned_0 = 1;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: uint16_t elements with UINT8_T4_PACKED.
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  vector<uint16_t, 2> input_vector_1 =
+      input_vector_buffer.Load<vector<uint16_t, 2> >(0);
+  const uint is_input_unsigned_1 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} 
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 2: int32_t elements with UINT8_T4_PACKED.
+  const uint input_interpretation_2 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  vector<int32_t, 1> input_vector_2 =
+      input_vector_buffer.Load<vector<int32_t, 1> >(0);
+  const uint is_input_unsigned_2 = 1;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2,
+                      is_input_unsigned_2, input_interpretation_2, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 3: int32_t elements with SINT8_T4_PACKED.
+  const uint input_interpretation_3 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<int32_t, 1> input_vector_3 =
+      input_vector_buffer.Load<vector<int32_t, 1> >(0);
+  const uint is_input_unsigned_3 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_3,
+                      is_input_unsigned_3, input_interpretation_3, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 4: float elements with SINT8_T4_PACKED.
+  const uint input_interpretation_4 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<float, 1> input_vector_4 =
+      input_vector_buffer.Load<vector<float, 1> >(0);
+  const uint is_input_unsigned_4 = 0;
+
+  // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_4, 
+                      is_input_unsigned_4, input_interpretation_4, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// IsInputUnsigned must be true for packed input vector type
+void test_invalid_is_input_unsigned_packed_input_vector_type() {
+  // A packed input interpretation combined with IsInputUnsigned == 0 is rejected, for both packed kinds.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;  
+  const uint matrix_stride = 64;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32;
+  // Case 0: UINT8_T4_PACKED with the flag set to 0.
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;  
+  vector<uint, 1> input_vector_0 = 
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_0 = 0;
+
+  // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,  
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: SINT8_T4_PACKED with the flag set to 0.
+  const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_1 = 0;
+
+  // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check packed input vector dimension
+void test_invalid_packed_input_vector_dimension() {
+  // With UINT8_T4_PACKED input, the vector length must be the smallest count of uints that holds K packed values.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 0;
+  const uint bias_offset = 0;
+  const uint bias_interpretation = DataType::DATA_TYPE_UINT32;
+  // Case 0: vector too long - 4 elements supplied for K = 4.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint matrix_dimK_0 = 4;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: vector too short - 1 element supplied for K = 7.
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint matrix_dimK_1 = 7;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 2: vector too long - 3 elements supplied for K = 7. The Load<> type matches the declared vector type.
+  vector<uint, 3> input_vector_2 =
+      input_vector_buffer.Load<vector<uint, 3> >(0);
+  const uint matrix_dimK_2 = 7;
+
+  // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_2, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+}
+
+// Input vector type/isInputUnsigned mismatch
+void test_invalid_input_vector_type_mismatch() {
+  // Each call pairs an input vector element type with a contradicting IsInputUnsigned flag.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Case 0: uint elements, flag set to 0.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);    
+  const uint is_input_unsigned_0 = 0;
+
+  // expected-error@+2 {{IsInputUnsigned must be true for vector of unsigned integer type}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: int32_t elements, flag set to 1.
+  vector<int32_t, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<int32_t, 4> >(0);
+  const uint is_input_unsigned_1 = 1;
+
+  // expected-error@+2 {{IsInputUnsigned must be false for vector of signed integer type}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 2: float16_t elements, flag set to 1.
+  vector<float16_t, 4> input_vector_2 =
+      input_vector_buffer.Load<vector<float16_t, 4> >(0);
+  const uint is_input_unsigned_2 = 1;
+
+  // expected-error@+2 {{IsInputUnsigned must be false for vector of floating point type}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2,
+                      is_input_unsigned_2, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix M dimension is a constant parameter
+void test_invalid_matrix_M_dimension() {
+  // matrix_dimM is initialized from a buffer load instead of a literal, which the builtin rejects.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64; 
+  // Declared const, but the initializer is a runtime load from constants_buffer.
+  const uint matrix_dimM = constants_buffer.Load<uint>(0);   
+
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix K dimension is a constant parameter
+void test_invalid_matrix_K_dimension() {
+  // matrix_dimK is initialized from a buffer load instead of a literal, which the builtin rejects.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0; 
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Declared const, but the initializer is a runtime load from constants_buffer.
+  const uint matrix_dimK = constants_buffer.Load<uint>(0);
+
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix M dimension is non-zero
+void test_invalid_matrix_M_dimension_non_zero() {
+  // Passing 0 for the matrix M dimension is rejected.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // The value under test.
+  const uint matrix_dimM = 0;
+  // expected-error@+3 {{matrix dimension must be greater than 0}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix K dimension is non-zero
+void test_invalid_matrix_K_dimension_non_zero() {
+  // Passing 0 for the matrix K dimension is rejected.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // The value under test.
+  const uint matrix_dimK = 0;
+  // expected-error@+4 {{matrix dimension must be greater than 0}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix M dimension is less than Max
+void test_invalid_matrix_M_dimension_less_than_Max() {
+  // Matrix M dimensions of 1025 and 4097 are both rejected, whichever input interpretation is used.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = matrix_dimK * 4;
+  // Case 0: FLOAT32 input interpretation, M = 1025.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM_0 = 1025;
+
+  // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM_0,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: UINT8_T4_PACKED input interpretation, M = 4097.
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimM_1 = 4097;
+
+  // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM_1,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix K dimension is less than Max in unpacked input vector case
+void test_invalid_matrix_K_dimension_less_than_Max_unpacked_input_vector() {
+  // With non-packed input interpretations, matrix K dimensions of 1025 and 4096 are both rejected.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Case 0: FLOAT32 input interpretation, K = 1025.
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK_0 = 1025;
+
+  // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+  // Case 1: UINT8 (non-packed) input interpretation, K = 4096.
+  vector<uint, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimK_1 = 4096;
+  // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, 
+                      is_input_unsigned, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+}
+
+// Check if Matrix K dimension is less than Max in packed input vector case
+void test_invalid_matrix_M_dimension_less_than_Max_packed_input_vector() {
+  // NOTE(review): despite "M_dimension" in the name, the rejected value here is matrix_dimK_0 (4097).
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 1024;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 4096;
+  // UINT8_T4_PACKED input interpretation with K = 4097.
+  vector<uint, 1024> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1024> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimK_0 = 4097;
+
+  // expected-error@+4 {{matrix dimension K when using packed input vectors must be less than 4096, in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if InputInterpretation is a constant parameter
+void test_invalid_input_interpretation_non_const() {
+  // input_interpretation is initialized from a buffer load instead of a literal, which the builtin rejects.
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+  // Declared const, but the initializer is a runtime load from constants_buffer.
+  const uint input_interpretation = constants_buffer.Load<uint>(0);
+
+  // expected-error@+2 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if InputInterpretation is a valid value
+void test_invalid_input_interpretation_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);   
+  const uint is_input_unsigned = 0;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint input_interpretation_0 = 0;
+
+  // expected-error@+2 {{0 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_1 = 1;
+
+  // expected-error@+2 {{1 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_1, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_2 = 6;
+
+  // expected-error@+2 {{6 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_2, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_3 = 7;
+
+  // expected-error@+2 {{7 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_3, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);   
+
+  const uint input_interpretation_4 = 10;
+
+  // expected-error@+2 {{10 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_4, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,    
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_5 = 11;       
+
+  // expected-error@+2 {{11 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_5, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_6 = 12;
+
+  // expected-error@+2 {{12 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_6, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_7 = 13;
+
+  // expected-error@+2 {{13 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_7, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_8 = 14;
+
+  // expected-error@+2 {{14 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_8, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_9 = 15;
+
+  // expected-error@+2 {{15 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_9, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_10 = 16;
+
+  // expected-error@+2 {{16 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_10, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_11 = 23;
+
+  // expected-error@+2 {{23 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_11, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_12 = 100;
+
+  // expected-error@+2 {{100 is an invalid register interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation_12, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+// Check that non-packed input/output vector lengths are validated against K/M (square 32x32 matrix)
+void test_invalid_input_output_vector_dimensions_non_packed_square_matrix() {
+
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 32;
+  const uint matrix_dimK = 32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  vector<uint, 32> output_vector_0;
+  vector<float, 30> input_vector_0 =   
+      input_vector_buffer.Load<vector<float, 30> >(0); // input length 30 != K (32)
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector_0, is_output_unsigned, input_vector_0,  
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  vector<uint, 30> output_vector_1; // output length 30 != M (32)
+  vector<float, 32> input_vector_1 =   
+      input_vector_buffer.Load<vector<float, 32> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector_1, is_output_unsigned, input_vector_1,    
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that non-packed input/output vector lengths are validated against a rectangular matrix (M = 16, K = 32)
+void test_invalid_input_output_vector_dimensions_non_packed_rectangle_matrix() {
+
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 16;
+  const uint matrix_dimK = 32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  // Output length uses K (32) instead of M (16); input length (32) matches K
+  vector<uint, 32> output_vector_0;
+  vector<float, 32> input_vector_0 =   
+      input_vector_buffer.Load<vector<float, 32> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector_0, is_output_unsigned, input_vector_0,  
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+ 
+ // Off-by-one: output length 17 is M + 1 (input length 16 also differs from K; only the output diagnostic is listed)
+  vector<uint, 17> output_vector_1;
+  vector<float, 16> input_vector_1 =   
+      input_vector_buffer.Load<vector<float, 16> >(0);
+
+  // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector_1, is_output_unsigned, input_vector_1,    
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+ // Off-by-one: output length 15 is M - 1 (input length 16 also differs from K; only the output diagnostic is listed)
+ vector<uint, 15> output_vector_2;
+ vector<float, 16> input_vector_2 =   
+     input_vector_buffer.Load<vector<float, 16> >(0);
+
+ // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}         
+ __builtin_MatVecMul(output_vector_2, is_output_unsigned, input_vector_2,    
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  // Input length uses M (16) instead of K (32); output length (16) matches M
+  vector<uint, 16> output_vector_3;
+  vector<float, 16> input_vector_3 =   
+      input_vector_buffer.Load<vector<float, 16> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}
+  __builtin_MatVecMul(output_vector_3, is_output_unsigned, input_vector_3,  
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  // Off-by-one: input length 31 is K - 1
+  vector<uint, 16> output_vector_4;
+  vector<float, 31> input_vector_4 =   
+      input_vector_buffer.Load<vector<float, 31> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}    
+  __builtin_MatVecMul(output_vector_4, is_output_unsigned, input_vector_4,  
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  // Off-by-one: input length 33 is K + 1
+  vector<uint, 16> output_vector_5;
+  vector<float, 33> input_vector_5 =   
+      input_vector_buffer.Load<vector<float, 33> >(0);
+
+  // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}}    
+  __builtin_MatVecMul(output_vector_5, is_output_unsigned, input_vector_5,  
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+    // Lengths swapped (output 32 = K, input 16 = M); only the output-length diagnostic is listed
+    vector<uint, 32> output_vector_6;
+    vector<float, 16> input_vector_6 =   
+        input_vector_buffer.Load<vector<float, 16> >(0);
+
+    // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}}    
+    __builtin_MatVecMul(output_vector_6, is_output_unsigned, input_vector_6,  
+                        is_input_unsigned, input_interpretation, matrix_buffer,
+                        matrix_offset, matrix_interpretation, matrix_dimM,
+                        matrix_dimK, matrix_layout, matrix_is_transposed,
+                        matrix_stride);
+}
+
+// Check that the matrix interpretation argument must be a constant expression
+void test_invalid_matrix_interpretation_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; // NOTE(review): unused; the call passes matrix_interpretation_0
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint matrix_interpretation_0 = constants_buffer.Load<uint>(0); // loaded from a buffer, so not a constant expression
+
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_0, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that invalid matrix (memory) interpretation values are rejected; exercises 0, 1, 6, 7, 10-16, 23 and 100
+void test_invalid_matrix_interpretation_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint matrix_interpretation_0 = 0;
+
+  // expected-error@+3 {{0 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_0, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_1 = 1;
+
+  // expected-error@+3 {{1 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_1, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_2 = 6;
+
+  // expected-error@+3 {{6 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_2, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_3 = 7;
+
+  // expected-error@+3 {{7 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation_3, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_4 = 10;
+
+  // expected-error@+3 {{10 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_4, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_5 = 11;
+
+  // expected-error@+3 {{11 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_5, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_6 = 12;
+
+  // expected-error@+3 {{12 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_6, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_7 = 13;
+
+  // expected-error@+3 {{13 is an invalid memory interpretation value}} 
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_7, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);       
+
+  const uint matrix_interpretation_8 = 14;
+
+  // expected-error@+3 {{14 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_8, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_9 = 15;
+
+  // expected-error@+3 {{15 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_9, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_10 = 16;
+
+  // expected-error@+3 {{16 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_10, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_11 = 23;
+  // expected-error@+3 {{23 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_11, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint matrix_interpretation_12 = 100;
+
+  // expected-error@+3 {{100 is an invalid memory interpretation value}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation_12, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that the matrix layout argument must be a constant expression
+void test_invalid_matrix_layout_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);   
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint matrix_layout = constants_buffer.Load<uint>(0); // loaded from a buffer, so not a constant expression
+
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that a matrix layout value outside the range [0, 3] is rejected
+void test_invalid_matrix_layout_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint matrix_layout_0 = 4; // first value past the accepted range [0, 3]
+
+  // expected-error@+4 {{matrix layout 4 is not valid, must be in the range [0, 3]}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout_0, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that the matrix-is-transposed argument must be a constant expression
+void test_invalid_matrix_transposed_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = constants_buffer.Load<bool>(0); // loaded from a buffer, so not a constant expression
+  const uint matrix_stride = 64;
+
+  // expected-error@+4 {{expression is not an integer constant expression}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check that requesting a transpose is rejected for the RowMajor and ColumnMajor layouts
+void test_invalid_matrix_transpose_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =   
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;   
+  const uint matrix_stride = 64;
+
+  const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed_0 = true;
+
+  // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout_0, matrix_is_transposed_0,
+                      matrix_stride);
+
+  const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR;
+  const bool matrix_is_transposed_1 = true;
+
+  // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout_1, matrix_is_transposed_1,
+                      matrix_stride);
+}
+
+
+// Check that a non-zero matrix stride is rejected for the MUL_OPTIMAL and OUTER_PRODUCT_OPTIMAL layouts
+void test_invalid_matrix_stride_constant_value() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const bool matrix_is_transposed = false;
+
+  const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const uint matrix_stride_0 = 64; // non-zero stride with an optimal layout
+
+  // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout_0, matrix_is_transposed,
+                      matrix_stride_0);
+
+  const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride_1 = 64; // non-zero stride with an optimal layout
+
+  // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}}
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,   
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout_1, matrix_is_transposed,
+                      matrix_stride_1);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl
new file mode 100644
index 0000000000..5972b22b95
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl
@@ -0,0 +1,344 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+
+ByteAddressBuffer input_vector_buffer;
+ByteAddressBuffer matrix_buffer; 
+RWByteAddressBuffer output_vector_buffer;
+ByteAddressBuffer const_buffer;
+
+// Valid output vector element types (uint, int32_t, float), each paired with its is_output_unsigned flag
+void test_valid_output_vector_type() {
+
+    vector<float, 4> input_vector = input_vector_buffer.Load<vector<float, 4> >(0);
+    const uint is_input_unsigned = 0;
+    const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+    const uint matrix_offset = 0;
+    const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+    const uint matrix_dimM = 4;
+    const uint matrix_dimK = 4;
+    const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+    const bool matrix_is_transposed = false;
+    const uint matrix_stride = 64;
+
+    vector<uint, 4> output_vector_0;
+    const uint is_output_unsigned_0 = 1; // uint output, flag set
+
+    // expected-no-diagnostics@+1
+    __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector,
+        is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+        matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+        matrix_is_transposed, matrix_stride);
+
+    vector<int32_t, 4> output_vector_1;
+    const uint is_output_unsigned_1 = 0; // int32_t output, flag clear
+
+    // expected-no-diagnostics@+1
+    __builtin_MatVecMul(output_vector_1, is_output_unsigned_1, input_vector,
+        is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+        matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+        matrix_is_transposed, matrix_stride);
+
+    vector<float, 4> output_vector_2;
+    const uint is_output_unsigned_2 = 0; // float output, flag clear
+
+    // expected-no-diagnostics@+1
+    __builtin_MatVecMul(output_vector_2, is_output_unsigned_2, input_vector,
+        is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
+        matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
+        matrix_is_transposed, matrix_stride);
+}
+
+void test_valid_is_output_unsigned_non_const() { // valid call: uint output vector with is_output_unsigned = 1
+
+  vector<uint, 4> output_vector_0;
+  vector<float, 4> input_vector =
+      input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint is_input_unsigned = 0;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  const uint is_output_unsigned_0 = 1; // NOTE(review): declared const despite the "non_const" test name - confirm intent
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Valid input vector element types: int32_t, uint32_t and float16_t
+void test_valid_input_vector_type() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+    vector<int32_t, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<int32_t, 4> >(0);
+    const uint is_input_unsigned_0 = 0; // int32_t input, flag clear
+
+ // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+    vector<uint32_t, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint32_t, 4> >(0);
+    const uint is_input_unsigned_1 = 1; // uint32_t input, flag set
+
+ // expected-no-diagnostics@+1 
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+    vector<float16_t, 4> input_vector_2 =
+      input_vector_buffer.Load<vector<float16_t, 4> >(0);
+    const uint is_input_unsigned_2 = 0; // float16_t input, flag clear
+
+ // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2,
+                      is_input_unsigned_2, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check valid packed input interpretations (UINT8_T4_PACKED, SINT8_T4_PACKED) with a one-element uint32_t input vector
+void test_valid_input_vector_packed_types() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; // NOTE(review): unused; the calls pass input_interpretation_0/_1
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+ const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+ vector<uint32_t, 1> input_vector_0 =
+     input_vector_buffer.Load<vector<uint32_t, 1> >(0);
+ const uint is_input_unsigned_0 = 1;
+
+ // expected-no-diagnostics@+1
+ __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, 
+                     is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                     matrix_offset, matrix_interpretation, matrix_dimM,
+                     matrix_dimK, matrix_layout, matrix_is_transposed,
+                     matrix_stride);
+
+ const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+ vector<uint32_t, 1> input_vector_1 =
+     input_vector_buffer.Load<vector<uint32_t, 1> >(0);
+ const uint is_input_unsigned_1 = 1;
+
+ // expected-no-diagnostics@+1  
+ __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                     is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                     matrix_offset, matrix_interpretation, matrix_dimM,
+                     matrix_dimK, matrix_layout, matrix_is_transposed,
+                     matrix_stride);                  
+
+}
+
+// Valid: is_input_unsigned is 1 for both packed input interpretations (UINT8_T4_PACKED, SINT8_T4_PACKED)
+void test_valid_is_input_unsigned_packed_input_vector_type() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; // NOTE(review): unused; the calls pass input_interpretation_0/_1
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;  
+  const uint matrix_stride = 64;
+
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;  
+  vector<uint, 1> input_vector_0 = 
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_0 = 1;
+
+  // expected-no-diagnostics@+2
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned_0, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,  
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint is_input_unsigned_1 = 1;
+  
+  // expected-no-diagnostics@+2
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned_1, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check packed input vector dimension
+void test_valid_packed_input_vector_dimension() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 0;
+
+  vector<uint, 1> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint matrix_dimK_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  vector<uint, 2> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 2> >(0);
+  const uint matrix_dimK_1 = 7;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned, input_interpretation, matrix_buffer, 
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix M dimension is less than Max
+void test_valid_matrix_M_dimension_less_than_Max() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = matrix_dimK * 4;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM_0,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  vector<uint, 1> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 1> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimM_1 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1,
+                      is_input_unsigned, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM_1,
+                      matrix_dimK, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
+
+// Check if Matrix K dimension is less than Max in unpacked input vector case
+void test_valid_matrix_K_dimension_less_than_Max_unpacked_input_vector() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  vector<uint, 4> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimK_0 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+  vector<uint, 4> input_vector_1 =
+      input_vector_buffer.Load<vector<uint, 4> >(0);
+  const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8;
+  const uint matrix_dimK_1 = 4;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, 
+                      is_input_unsigned, input_interpretation_1, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_1, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+
+}
+
+// Check if Matrix K dimension is less than Max in packed input vector case
+void test_valid_matrix_M_dimension_less_than_Max_packed_input_vector() {
+
+  vector<uint, 4> output_vector;
+  const uint is_output_unsigned = 1;
+  const uint is_input_unsigned = 1;
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_dimM = 4;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+  const bool matrix_is_transposed = false;
+  const uint matrix_stride = 64;
+
+  vector<uint, 1024> input_vector_0 =
+      input_vector_buffer.Load<vector<uint, 1024> >(0);
+  const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+  const uint matrix_dimK_0 = 4096;
+
+  // expected-no-diagnostics@+1
+  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0,
+                      is_input_unsigned, input_interpretation_0, matrix_buffer,
+                      matrix_offset, matrix_interpretation, matrix_dimM,
+                      matrix_dimK_0, matrix_layout, matrix_is_transposed,
+                      matrix_stride);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl
new file mode 100644
index 0000000000..4e15c92a5d
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl
@@ -0,0 +1,256 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+
+ByteAddressBuffer input_vector_buffer;
+RWByteAddressBuffer accumulate_buffer;
+ByteAddressBuffer constants_buffer;
+
+// Check that input vectors with different component types are rejected
+void test_invalid_input_vector_component_type() {
+
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride = 0;
+
+  vector<float, 4> input_vector_0_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<uint, 4> input_vector_1_0 = input_vector_buffer.Load<vector<uint, 4> >(0);
+
+  // expected-error@+1 {{input vectors of outerproductaccumulate must have the same element type}}
+  __builtin_OuterProductAccumulate(input_vector_0_0, input_vector_1_0,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+
+  vector<int, 4> input_vector_0_1 = input_vector_buffer.Load<vector<int, 4> >(0);
+  vector<float, 4> input_vector_1_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+
+  // expected-error@+1 {{input vectors of outerproductaccumulate must have the same element type}}
+  __builtin_OuterProductAccumulate(input_vector_0_1, input_vector_1_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
+
+// Check that a non-constant matrix interpretation is rejected
+void test_non_constant_matrix_interpretation() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride = 0;
+
+  const uint matrix_interpretation = constants_buffer.Load<uint>(0);
+
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
+
+// Check that invalid matrix interpretation values are rejected
+void test_invalid_matrix_interpretation() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride = 0;
+
+  const uint matrix_interpretation = 0;
+
+  // expected-error@+3 {{0 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_2 = 1;
+
+  // expected-error@+3 {{1 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_2, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_3 = 6;
+
+  // expected-error@+3 {{6 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_3, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_4 = 7;
+
+  // expected-error@+3 {{7 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_4, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_5 = 10;
+
+  // expected-error@+3 {{10 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_5, matrix_layout,
+                                  matrix_stride); 
+
+  const uint matrix_interpretation_6 = 11;
+
+  // expected-error@+3 {{11 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_6, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_7 = 12;
+
+  // expected-error@+3 {{12 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_7, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_8 = 13;
+
+  // expected-error@+3 {{13 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_8, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_9 = 14;
+
+  // expected-error@+3 {{14 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_9, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_10 = 15;
+
+  // expected-error@+3 {{15 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_10, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_11 = 16;
+
+  // expected-error@+3 {{16 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_11, matrix_layout,
+                                  matrix_stride); 
+
+  const uint matrix_interpretation_12 = DataType::DATA_TYPE_SINT8_T4_PACKED;
+
+  // expected-error@+3 {{17 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_12, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_13 = DataType::DATA_TYPE_UINT8_T4_PACKED;
+
+  // expected-error@+3 {{18 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_13, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_14 = 23;
+
+  // expected-error@+3 {{23 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_14, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_interpretation_15 = 100;
+
+  // expected-error@+3 {{100 is an invalid memory interpretation value}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation_15, matrix_layout,
+                                  matrix_stride);                   
+                              
+}
+
+// Check that a non-constant matrix layout is rejected
+void test_non_constant_matrix_layout() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_stride = 0;
+
+  const uint matrix_layout = constants_buffer.Load<uint>(0);
+
+  // expected-error@+3 {{expression is not an integer constant expression}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
+
+// Check that matrix layouts other than outer-product-optimal are rejected
+void test_invalid_matrix_layout() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; 
+  const uint matrix_stride = 0;
+
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR;
+
+  // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+
+  const uint matrix_layout_2 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR;
+
+  // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout_2,
+                                  matrix_stride);
+
+  const uint matrix_layout_3 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL;
+
+  // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout_3,
+                                  matrix_stride);                               
+                                  
+}
+
+// Check that a constant, non-zero matrix stride is rejected for an optimal layout
+void test_zero_matrix_stride() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+
+  const uint matrix_stride = 16;
+
+  // expected-error@+4 {{for optimal matrix layout, matrix stride must be 0}}
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl
new file mode 100644
index 0000000000..85298e2dbb
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl
@@ -0,0 +1,66 @@
+// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify
+
+#include <dx/linalg.h>
+
+using namespace dx::linalg;
+
+ByteAddressBuffer input_vector_buffer;
+RWByteAddressBuffer accumulate_buffer;
+ByteAddressBuffer constants_buffer;
+
+// Check that input vectors with the same component type (and differing sizes) are accepted
+void test_invalid_input_vector_component_type() {
+
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+  const uint matrix_stride = 0;
+
+  vector<float, 4> input_vector_0_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 16> input_vector_1_0 = input_vector_buffer.Load<vector<float, 16> >(0);
+
+      // expected-no-diagnostics@+1
+  __builtin_OuterProductAccumulate(input_vector_0_0, input_vector_1_0,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+
+  vector<int, 32> input_vector_0_1 = input_vector_buffer.Load<vector<int, 32> >(0);
+  vector<int ,16> input_vector_1_1 = input_vector_buffer.Load<vector<int, 16> >(0);
+
+     // expected-no-diagnostics@+1
+  __builtin_OuterProductAccumulate(input_vector_0_1, input_vector_1_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+
+  vector<uint, 4> input_vector_0_2 = input_vector_buffer.Load<vector<uint, 4> >(0);
+  vector<uint, 16> input_vector_1_2 = input_vector_buffer.Load<vector<uint, 16> >(0);
+
+  // expected-no-diagnostics@+1
+  __builtin_OuterProductAccumulate(input_vector_0_2, input_vector_1_2,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
+
+// Check that a non-constant matrix stride is accepted
+void test_non_constant_matrix_stride() {
+
+  vector<float, 4> input_vector_0 = input_vector_buffer.Load<vector<float, 4> >(0);
+  vector<float, 4> input_vector_1 = input_vector_buffer.Load<vector<float, 4> >(0);
+  const uint matrix_offset = 0;
+  const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32;
+  const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL;
+
+  const uint matrix_stride = constants_buffer.Load<uint>(0);
+
+  // expected-no-diagnostics@+4
+  __builtin_OuterProductAccumulate(input_vector_0, input_vector_1,
+                                  accumulate_buffer, matrix_offset,
+                                  matrix_interpretation, matrix_layout,
+                                  matrix_stride);
+}
+
+// TODO: no test follows this heading; add coverage for matrix stride values or remove it
+
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
index 9f2793d417..be67d92546 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl
@@ -10,7 +10,7 @@ export float4 Test1(vector<float, 4> Input) {
       Buf, 0, 0};
 
   // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}}
-  // expected-note@dx/linalg.h:97{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
+  // expected-note@dx/linalg.h:113{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
   return Mul<float>(    
       Matrix, MakeInterpretedVector<2>(Input));
 }
@@ -26,7 +26,7 @@ export float4 Test2(vector<float, 4> Input) {
       Buf, 0, 0};
 
   // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}}
-  // expected-note@dx/linalg.h:97{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
+  // expected-note@dx/linalg.h:113{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}}
   return Mul<float>(    
       Matrix, MakeInterpretedVector<DATA_TYPE_InvalidType>(Input));
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
index 2d5a11e83e..b911de648e 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl
@@ -11,6 +11,6 @@ vector<float, 128> MixUpVectorAndMatrixArguments(vector<float, 128> Input) {
       Buf, 0, 0};
 
   // expected-error@+2{{no matching function for call to 'Mul'}}
-  // expected-note@dx/linalg.h:111{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
+  // expected-note@dx/linalg.h:127{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
   return Mul<float>(MakeInterpretedVector<DATA_TYPE_FLOAT16>(Input), Matrix);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
index f444f81c3a..24ad3ef46c 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl
@@ -11,6 +11,6 @@ vector<float, 128> MixUpVectorAndMatrixArguments(vector<float, 128> Input) {
       Buf, 0, 0};
 
   // expected-error@+2{{no matching function for call to 'MulAdd'}}
-  // expected-note@dx/linalg.h:137{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
+  // expected-note@dx/linalg.h:153{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}}
   return MulAdd<float>(MakeInterpretedVector<DATA_TYPE_SINT16>(Input), Matrix, MakeInterpretedVector<DATA_TYPE_SINT16>(Input));
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
index 6f503b367b..5759631bcb 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl
@@ -12,7 +12,7 @@ export void Test4(vector<half, 128> Input1, vector<half, 64> Input2) {
       matrix = {RWBuf, 0, 0};
 
   // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
-  // expected-note@dx/linalg.h:161{{candidate template ignored: could not match 0 against 1}}
+  // expected-note@dx/linalg.h:177{{candidate template ignored: could not match 0 against 1}}
 
   OuterProductAccumulate(Input1, Input2, matrix);  
 }
@@ -25,7 +25,7 @@ export void Test5(vector<int, 128> Input1, vector<uint, 128> Input2) {
       matrix = {RWBuf, 0, 0};
 
   // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
-  // expected-note@dx/linalg.h:161{{candidate template ignored: could not match 0 against 1}}
+  // expected-note@dx/linalg.h:177{{candidate template ignored: could not match 0 against 1}}
 
   OuterProductAccumulate(Input1, Input2, matrix);  
 }
@@ -38,7 +38,7 @@ export void Test4(vector<half, 64> Input1, vector<half, 64> Input2) {
       matrix = {RWBuf, 0, 0};
 
   // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}}
-  // expected-note@dx/linalg.h:161{{candidate template ignored: deduced conflicting types for parameter 'ElTy' ('int' vs. 'unsigned int')}}
+  // expected-note@dx/linalg.h:177{{candidate template ignored: deduced conflicting types for parameter 'ElTy' ('int' vs. 'unsigned int')}}
 
   OuterProductAccumulate(Input1, Input2, matrix);  
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl
index d5e251ae8b..57683b9a59 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl
@@ -23,7 +23,7 @@ void cs_main()
     const bool matrix_is_transposed = false; 
     const uint matrix_stride = 64;
 
-    //expected-error@+1{{intrinsic __builtin_MatVecMul potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    //expected-error@+1{{intrinsic hlsl::__builtin_MatVecMul potentially used by ''cs_main'' requires shader model 6.9 or greater}}
     __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, 
       is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
       matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
@@ -32,7 +32,7 @@ void cs_main()
     const uint bias_offset = 0;
     const uint bias_interpretation = 9; /*F32*/
 
-    //expected-error@+1{{intrinsic __builtin_MatVecMulAdd potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    //expected-error@+1{{intrinsic hlsl::__builtin_MatVecMulAdd potentially used by ''cs_main'' requires shader model 6.9 or greater}}
     __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
       is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
       matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
@@ -44,16 +44,16 @@ void cs_main()
     const uint opa_matrix_offset = 0;
     const uint opa_matrix_interpretation = 5; /*U32*/
     const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
-    const uint opa_matrix_stride = 64;
+    const uint opa_matrix_stride = 0;
 
-    //expected-error@+1{{intrinsic __builtin_OuterProductAccumulate potentially used by 'cs_main' requires shader model 6.9 or greater}}
+    //expected-error@+1{{intrinsic hlsl::__builtin_OuterProductAccumulate potentially used by ''cs_main'' requires shader model 6.9 or greater}}
     __builtin_OuterProductAccumulate(input_vector1, input_vector2,
       rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
       opa_matrix_layout, opa_matrix_stride);
 
     const uint va_matrix_offset = 0;
 
-     //expected-error@+1{{intrinsic __builtin_VectorAccumulate potentially used by 'cs_main' requires shader model 6.9 or greater}}
-     __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
-       va_matrix_offset);
+    //expected-error@+1{{intrinsic hlsl::__builtin_VectorAccumulate potentially used by ''cs_main'' requires shader model 6.9 or greater}}
+    __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
+      va_matrix_offset);
 }
\ No newline at end of file
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index e5e4119330..60bef02f18 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -383,13 +383,13 @@ void [[]] Barrier(in NodeRecordOrUAV o, in uint SemanticFlags);
 
 uint [[]] GetRemainingRecursionLevels();
 
-void [[]] __builtin_MatVecMul(out numeric<c> OutputVector, in bool OutputIsUnsigned, in numeric<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride);
+void [[min_sm=6.9]] __builtin_MatVecMul(out LinAlg<c> OutputVector, in bool OutputIsUnsigned, in LinAlg<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride);
 
-void [[]] __builtin_MatVecMulAdd(out numeric<c> OutputVector, in bool OutputIsUnsigned, in numeric<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride, in ByteAddressBuffer BiasVector, in uint BiasOffset, in uint BiasInterpretation);
+void [[min_sm=6.9]] __builtin_MatVecMulAdd(out LinAlg<c> OutputVector, in bool OutputIsUnsigned, in LinAlg<c2> InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride, in ByteAddressBuffer BiasVector, in uint BiasOffset, in uint BiasInterpretation);
 
-void [[]] __builtin_OuterProductAccumulate(in numeric<c> InputVector1, in numeric<c2> InputVector2, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint MatrixLayout, in uint MatrixStride);
+void [[min_sm=6.9]] __builtin_OuterProductAccumulate(in LinAlg<c> InputVector1, in LinAlg<c2> InputVector2, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint MatrixLayout, in uint MatrixStride);
 
-void [[]] __builtin_VectorAccumulate(in numeric<c> InputVector, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset);
+void [[min_sm=6.9]] __builtin_VectorAccumulate(in LinAlg<c> InputVector, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset);
 
 } namespace
 
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 5567a6a88d..3af3cde949 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -8419,13 +8419,13 @@ def build_valrules(self):
         self.add_valrule_msg(
             "Instr.MatVecOpIsUnsignedFlagsAreConst",
             "In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant.",
-            "'%1' is not a constant value",
+            "%0 is not a constant value",
         )
 
         self.add_valrule_msg(
             "Instr.LinalgInterpretationParamAreConst",
             "In Linalg operations, Interpretation value is a constant.",
-            "'%1' is not a constant value",
+            "%0 is not a constant value",
         )
 
         self.add_valrule_msg(
@@ -9357,6 +9357,7 @@ def __init__(self, intrinsic_defs, opcode_data):
             "DxHitObject": "LICOMPTYPE_HIT_OBJECT",
             "VkBufferPointer": "LICOMPTYPE_VK_BUFFER_POINTER",
             "RayQuery": "LICOMPTYPE_RAY_QUERY",
+            "LinAlg": "LICOMPTYPE_LINALG",
         }
 
         self.trans_rowcol = {"r": "IA_R", "c": "IA_C", "r2": "IA_R2", "c2": "IA_C2"}

From d72e2b1a15d22fc825e2f3c939f1baac43281ae9 Mon Sep 17 00:00:00 2001
From: Dan Brown <61992655+danbrown-amd@users.noreply.github.com>
Date: Thu, 22 May 2025 12:39:06 -0600
Subject: [PATCH 45/93] Removes improper modification copyright notice. (#7477)

Modification copyright notices were added in error to files changed in
[PR
#7163](https://github.com/microsoft/DirectXShaderCompiler/pull/7163).
---
 include/dxc/dxcapi.internal.h                          | 3 ---
 lib/HLSL/HLOperationLower.cpp                          | 3 ---
 tools/clang/include/clang/AST/HlslTypes.h              | 3 ---
 tools/clang/include/clang/AST/OperationKinds.h         | 3 ---
 tools/clang/include/clang/Basic/Attr.td                | 3 ---
 tools/clang/include/clang/Basic/DiagnosticSemaKinds.td | 3 ---
 tools/clang/include/clang/SPIRV/SpirvBuilder.h         | 3 ---
 tools/clang/include/clang/SPIRV/SpirvContext.h         | 3 ---
 tools/clang/include/clang/SPIRV/SpirvInstruction.h     | 3 ---
 tools/clang/include/clang/SPIRV/SpirvType.h            | 3 ---
 tools/clang/include/clang/SPIRV/SpirvVisitor.h         | 3 ---
 tools/clang/lib/AST/ASTContextHLSL.cpp                 | 3 ---
 tools/clang/lib/AST/Expr.cpp                           | 3 ---
 tools/clang/lib/AST/ExprConstant.cpp                   | 3 ---
 tools/clang/lib/AST/HlslTypes.cpp                      | 3 ---
 tools/clang/lib/Lex/PPMacroExpansion.cpp               | 3 ---
 tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp      | 3 ---
 tools/clang/lib/SPIRV/CapabilityVisitor.cpp            | 3 ---
 tools/clang/lib/SPIRV/EmitVisitor.cpp                  | 3 ---
 tools/clang/lib/SPIRV/EmitVisitor.h                    | 3 ---
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp             | 3 ---
 tools/clang/lib/SPIRV/LowerTypeVisitor.h               | 3 ---
 tools/clang/lib/SPIRV/SpirvBuilder.cpp                 | 3 ---
 tools/clang/lib/SPIRV/SpirvContext.cpp                 | 3 ---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp                 | 3 ---
 tools/clang/lib/SPIRV/SpirvEmitter.h                   | 3 ---
 tools/clang/lib/SPIRV/SpirvInstruction.cpp             | 3 ---
 tools/clang/lib/Sema/SemaCast.cpp                      | 3 ---
 tools/clang/lib/Sema/SemaExprCXX.cpp                   | 3 ---
 tools/clang/lib/Sema/SemaHLSL.cpp                      | 3 ---
 utils/hct/gen_intrin_main.txt                          | 3 ---
 utils/hct/hctdb.py                                     | 2 --
 32 files changed, 95 deletions(-)

diff --git a/include/dxc/dxcapi.internal.h b/include/dxc/dxcapi.internal.h
index 41891338e6..46a485206e 100644
--- a/include/dxc/dxcapi.internal.h
+++ b/include/dxc/dxcapi.internal.h
@@ -7,9 +7,6 @@
 //                                                                           //
 // Provides non-public declarations for the DirectX Compiler component.      //
 //                                                                           //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.              //
-// All rights reserved.                                                      //
-//                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
 
 #ifndef __DXC_API_INTERNAL__
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 58c1de3941..7d5eb0edce 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -7,9 +7,6 @@
 //                                                                           //
 // Lower functions to lower HL operations to DXIL operations.                //
 //                                                                           //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.              //
-// All rights reserved.                                                      //
-//                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
 
 #include "dxc/DXIL/DxilConstants.h"
diff --git a/tools/clang/include/clang/AST/HlslTypes.h b/tools/clang/include/clang/AST/HlslTypes.h
index c14f562101..58d2d916b1 100644
--- a/tools/clang/include/clang/AST/HlslTypes.h
+++ b/tools/clang/include/clang/AST/HlslTypes.h
@@ -6,9 +6,6 @@
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
 //                                                                           //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.              //
-// All rights reserved.                                                      //
-//                                                                           //
 ///
 /// \file                                                                    //
 /// \brief Defines the HLSL type system interface.                           //
diff --git a/tools/clang/include/clang/AST/OperationKinds.h b/tools/clang/include/clang/AST/OperationKinds.h
index 3909c8b5e8..d19082d699 100644
--- a/tools/clang/include/clang/AST/OperationKinds.h
+++ b/tools/clang/include/clang/AST/OperationKinds.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 // This file enumerates the different kinds of operations that can be
diff --git a/tools/clang/include/clang/Basic/Attr.td b/tools/clang/include/clang/Basic/Attr.td
index db7fdea8d9..1797597d17 100644
--- a/tools/clang/include/clang/Basic/Attr.td
+++ b/tools/clang/include/clang/Basic/Attr.td
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 class DocumentationCategory<string name> {
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 48412facad..0d98792688 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
index e4e6ef308f..2da14dab54 100644
--- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h
+++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_SPIRVBUILDER_H
 #define LLVM_CLANG_SPIRV_SPIRVBUILDER_H
diff --git a/tools/clang/include/clang/SPIRV/SpirvContext.h b/tools/clang/include/clang/SPIRV/SpirvContext.h
index c18c139642..50ff77d4b4 100644
--- a/tools/clang/include/clang/SPIRV/SpirvContext.h
+++ b/tools/clang/include/clang/SPIRV/SpirvContext.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_SPIRVCONTEXT_H
 #define LLVM_CLANG_SPIRV_SPIRVCONTEXT_H
diff --git a/tools/clang/include/clang/SPIRV/SpirvInstruction.h b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
index 6d95459373..20cd57525c 100644
--- a/tools/clang/include/clang/SPIRV/SpirvInstruction.h
+++ b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_SPIRVINSTRUCTION_H
 #define LLVM_CLANG_SPIRV_SPIRVINSTRUCTION_H
diff --git a/tools/clang/include/clang/SPIRV/SpirvType.h b/tools/clang/include/clang/SPIRV/SpirvType.h
index 00a00ef238..d39fc6943b 100644
--- a/tools/clang/include/clang/SPIRV/SpirvType.h
+++ b/tools/clang/include/clang/SPIRV/SpirvType.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_SPIRVTYPE_H
 #define LLVM_CLANG_SPIRV_SPIRVTYPE_H
diff --git a/tools/clang/include/clang/SPIRV/SpirvVisitor.h b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
index 95bc46aa5f..fef06da503 100644
--- a/tools/clang/include/clang/SPIRV/SpirvVisitor.h
+++ b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_SPIRVVISITOR_H
 #define LLVM_CLANG_SPIRV_SPIRVVISITOR_H
diff --git a/tools/clang/lib/AST/ASTContextHLSL.cpp b/tools/clang/lib/AST/ASTContextHLSL.cpp
index 0a688c03fa..913b28ced8 100644
--- a/tools/clang/lib/AST/ASTContextHLSL.cpp
+++ b/tools/clang/lib/AST/ASTContextHLSL.cpp
@@ -6,9 +6,6 @@
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
 //                                                                           //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.              //
-// All rights reserved.                                                      //
-//                                                                           //
 //  This file implements the ASTContext interface for HLSL.                  //
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/tools/clang/lib/AST/Expr.cpp b/tools/clang/lib/AST/Expr.cpp
index c6dc21217e..8ed14508af 100644
--- a/tools/clang/lib/AST/Expr.cpp
+++ b/tools/clang/lib/AST/Expr.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 // This file implements the Expr class and subclasses.
diff --git a/tools/clang/lib/AST/ExprConstant.cpp b/tools/clang/lib/AST/ExprConstant.cpp
index 69e0760bce..baa0349cfe 100644
--- a/tools/clang/lib/AST/ExprConstant.cpp
+++ b/tools/clang/lib/AST/ExprConstant.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 // This file implements the Expr constant evaluator.
diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index 05386ddaa5..017f0f7218 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -5,9 +5,6 @@
 // Copyright (C) Microsoft Corporation. All rights reserved.                 //
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
-//
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
 //                                                                           //
 ///
 /// \file                                                                    //
diff --git a/tools/clang/lib/Lex/PPMacroExpansion.cpp b/tools/clang/lib/Lex/PPMacroExpansion.cpp
index ebfb93df2e..16040d69c7 100644
--- a/tools/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/tools/clang/lib/Lex/PPMacroExpansion.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 // This file implements the top level handling of macro expansion for the
diff --git a/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp b/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp
index db140f4766..9bb2f1b1fa 100644
--- a/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp
+++ b/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #include "AlignmentSizeCalculator.h"
diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
index 9ca9cbc6cd..43ab2540b4 100644
--- a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
+++ b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #include "CapabilityVisitor.h"
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp
index f58160254a..7d39b0ec1f 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.cpp
+++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 // Do not change the inclusion order between "dxc/Support/*" files.
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.h b/tools/clang/lib/SPIRV/EmitVisitor.h
index bfa0710998..1cec230e50 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.h
+++ b/tools/clang/lib/SPIRV/EmitVisitor.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_SPIRV_EMITVISITOR_H
 #define LLVM_CLANG_SPIRV_EMITVISITOR_H
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index b31d19b5d8..8238750af9 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #include "LowerTypeVisitor.h"
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.h b/tools/clang/lib/SPIRV/LowerTypeVisitor.h
index 5b26b67e3a..26b6e44f6d 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.h
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_LIB_SPIRV_LOWERTYPEVISITOR_H
diff --git a/tools/clang/lib/SPIRV/SpirvBuilder.cpp b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
index 689fc0715f..a0dcb5420b 100644
--- a/tools/clang/lib/SPIRV/SpirvBuilder.cpp
+++ b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #include "clang/SPIRV/SpirvBuilder.h"
diff --git a/tools/clang/lib/SPIRV/SpirvContext.cpp b/tools/clang/lib/SPIRV/SpirvContext.cpp
index 47dfc67433..cb44d3a3a8 100644
--- a/tools/clang/lib/SPIRV/SpirvContext.cpp
+++ b/tools/clang/lib/SPIRV/SpirvContext.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 9ffa978511..ea2347edce 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 //  This file implements a SPIR-V emitter class that takes in HLSL AST and emits
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 10694313a8..978e88e4ed 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 //  This file defines a SPIR-V emitter class that takes in HLSL AST and emits
diff --git a/tools/clang/lib/SPIRV/SpirvInstruction.cpp b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
index 3b5861710d..f6ac29f379 100644
--- a/tools/clang/lib/SPIRV/SpirvInstruction.cpp
+++ b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 //  This file implements the in-memory representation of SPIR-V instructions.
diff --git a/tools/clang/lib/Sema/SemaCast.cpp b/tools/clang/lib/Sema/SemaCast.cpp
index f5a864e2b6..dcff6c2461 100644
--- a/tools/clang/lib/Sema/SemaCast.cpp
+++ b/tools/clang/lib/Sema/SemaCast.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 //
 //  This file implements semantic analysis for cast expressions, including
diff --git a/tools/clang/lib/Sema/SemaExprCXX.cpp b/tools/clang/lib/Sema/SemaExprCXX.cpp
index 5113c56205..1e70b95476 100644
--- a/tools/clang/lib/Sema/SemaExprCXX.cpp
+++ b/tools/clang/lib/Sema/SemaExprCXX.cpp
@@ -5,9 +5,6 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 //===----------------------------------------------------------------------===//
 ///
 /// \file
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index fa59aa6ef7..2163eef8a3 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -6,9 +6,6 @@
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
 //                                                                           //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.              //
-// All rights reserved.                                                      //
-//                                                                           //
 //  This file implements the semantic support for HLSL.                      //
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index 60bef02f18..ae8df55a0c 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -1,9 +1,6 @@
 // Copyright (C) Microsoft Corporation. All rights reserved.
 // This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
 //
-// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-// All rights reserved.
-//
 // See hctdb.py for the implementation of intrinsic file processing.
 //
 // Intrinsic declarations are grouped into namespaces that
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 3af3cde949..2b94b13134 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -1,7 +1,5 @@
 # Copyright (C) Microsoft Corporation. All rights reserved.
 # This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
-# Modifications Copyright(C) 2025 Advanced Micro Devices, Inc.
-# All rights reserved.
 ###############################################################################
 # DXIL information.                                                           #
 ###############################################################################

From d14d174d9c54845696613e2d00387a67d98f6fb4 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Tue, 27 May 2025 13:16:26 -0700
Subject: [PATCH 46/93] Update Release notes for 1.8.2505 (#7481) (#7483)

This change updates release notes with notable changes for release
1.8.2505.

(cherry picked from commit 9efbb6c3242cbb40c1844a2589171ff1c27cf956)
---
 docs/ReleaseNotes.md | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 7788c57726..274164158e 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -23,8 +23,42 @@ Place release notes for the upcoming release below this line and remove this lin
 
 ### Version 1.8.2505
 
+#### Potentially breaking changes
+
 - Typed buffers (including ROV buffers) no longer accept types other than vectors and scalars. Any other types will produce descriptive errors. This removes support for appropriately sized matrices and structs. Though it worked in some contexts, code generated from such types was unreliable.
-- By default, the internal validator will be used instead of searching externally for an existing DXIL.dll.
+  - Load and Store operations have been refactored as a consequence. Behavior should be identical, please file issues if discrepancies are observed.
+- The compiler will now always use the internal validator instead of searching for an external DXIL.dll.  The (hidden) `-select-validator` option has been removed.
+
+#### Notable SPIR-V updates
+
+- Fix unnecessary Int64 requirement when loading Float64
+- Added vk::BufferPointer, see [proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0010-vk-buffer-ref.md) for more details.
+- Implement QuadAny and QuadAll (#7266)
+- Fix -fvk-invert-y (#7447)
+
+#### Shader Model 6.9 Preview
+
+You can now compile shaders to SM 6.9, but this is a preview, so shader hashes will be set to the PREVIEW_BYPASS pattern.
+SM 6.9 shaders will only work with AgilitySDK 1.717.0-preview, a supported preview driver, and use of experimental shader models in developer mode.
+Preview shaders will not be compatible with the SM 6.9 release, or likely even later versions of the SM 6.9 preview.
+
+SM 6.9 Preview Additions:
+
+- Long vectors are allowed in HLSL when targeting shader model 6.9. Vectors up to 1024 elements in length can be loaded from/stored to raw buffers and used in elementwise operations. See the [long vector proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0026-hlsl-long-vector-type.md) for more details.
+- HLSL Vectors are still limited to a maximum of 4 elements when used in certain contexts:
+  - entry function inputs/outputs
+  - parameter, payload, attribute, and node record types for mesh, raytracing, and node shaders
+  - constant buffers (cbuffer), texture buffers (tbuffer), textures and typed buffers
+  - Note: some HLSL elementwise intrinsics do not yet support long vectors in this preview
+- Native vectors of up to 1024 elements are now present in DXIL. This includes vector llvm instructions, load/store, and various elementwise DXIL operations. This may result in smaller DXIL and potentially other performance improvements. See the [dxil vectors proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0030-dxil-vectors.md) for more details.
+- Cooperative Vector operations, a subset of Linear Algebra (LinAlg). See the [cooperative vectors proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0029-cooperative-vector.md) and the [HLSL header based API proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0031-hlsl-vector-matrix-operations.md) for more details.
+  - New built-in operations are added for multiplying long vectors with a matrix in a ByteAddressBuffer, optionally with accumulation and bias data, as well as outer product and vector accumulate operations.
+  - An HLSL header shipped with this release provides a more convenient API for using these built-in operations.
+- Support for [Opacity Micromaps](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0024-opacity-micromaps.md) in DXR shaders as well as for RayQuery.
+  - Unlocks DXR performance improvements using triangle sub-divisions for fast hit/miss detection to reduce the need for anyhit invocations.
+- Support for [Shader Execution Reordering](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md) in DXR.
+  - Introduces `MaybeReorderThread()` to explicitly specify where and how shader execution coherence can be improved. `MaybeReorderThread()` can be used in raygeneration shaders.
+  - `HitObject` decouples traversal, intersection testing and anyhit shading from closesthit and miss shading for more control and better reordering opportunities. `HitObject` can be used in raygeneration, closesthit and miss shaders.
 
 ### Version 1.8.2502
 

From 66287b27442d0af17a152d024a6deaadb075cd30 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 28 May 2025 19:46:27 -0400
Subject: [PATCH 47/93] Update submodules (#7492)

Update the submodules to the latest, and re-enable the test that is now
passing.

Fixes #7160
---
 external/SPIRV-Headers                                          | 2 +-
 external/SPIRV-Tools                                            | 2 +-
 .../clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl  | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index aa6cef192b..c9aad99f92 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit aa6cef192b8e693916eb713e7a9ccadf06062ceb
+Subproject commit c9aad99f9276817f18f72a4696239237c83cb775
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index a62abcb402..da48bb20bd 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit a62abcb402009b9ca5975e6167c09f237f630e0e
+Subproject commit da48bb20bdfc8a214d5bffdacca2d1d2ae849009
diff --git a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
index cb5d7f771f..2a143afab2 100644
--- a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
@@ -1,6 +1,4 @@
 // RUN: %dxc -T ms_6_6 -fspv-target-env=vulkan1.1spirv1.4 -E main %s -spirv | FileCheck %s
-// XFAIL: *
-// FIXME(7160): test disabled until the spirv-val fix is merged.
 
 struct MeshletPrimitiveOut
 {

From dc59ed092b17b48436e9220a950eee3d974dbbe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Fri, 30 May 2025 12:53:17 +0200
Subject: [PATCH 48/93] [SPIR-V] Fix r-value being used in mul intrinsic
 (#7489)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When dealing with the Load method on buffers, the operator call can emit
a pointer instead of an actual load, and the user is then responsible for
loading the value if required.
The `mul` intrinsic code was not handling this, which caused the
pointer to be passed as-is in SPIR-V.

Fixes #7246

Signed-off-by: Nathan Gauër <brioche@google.com>
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 17 ++++++++++---
 tools/clang/lib/SPIRV/SpirvEmitter.h          |  3 +++
 .../test/CodeGenSPIRV/intrinsics.mul.hlsl     | 25 +++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index ea2347edce..92e4c687ca 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -1260,6 +1260,15 @@ SpirvInstruction *SpirvEmitter::doExpr(const Expr *expr,
   return result;
 }
 
+SpirvInstruction *SpirvEmitter::doExprEnsuringRValue(const Expr *E,
+                                                     SourceLocation location,
+                                                     SourceRange range) {
+  SpirvInstruction *I = doExpr(E);
+  if (I->isRValue())
+    return I;
+  return spvBuilder.createLoad(E->getType(), I, location, range);
+}
+
 SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr,
                                               SourceRange rangeOverride) {
   // We are trying to load the value here, which is what an LValueToRValue
@@ -11364,8 +11373,8 @@ SpirvInstruction *SpirvEmitter::processIntrinsicMul(const CallExpr *callExpr) {
     uint32_t numRows = 0;
     if (isMxNMatrix(returnType, &elemType, &numRows)) {
       llvm::SmallVector<SpirvInstruction *, 4> rows;
-      auto *arg0Id = doExpr(arg0);
-      auto *arg1Id = doExpr(arg1);
+      auto *arg0Id = doExprEnsuringRValue(arg0, loc, range);
+      auto *arg1Id = doExprEnsuringRValue(arg1, loc, range);
       for (uint32_t i = 0; i < numRows; ++i) {
         auto *scalar = spvBuilder.createCompositeExtract(elemType, arg0Id, {i},
                                                          loc, range);
@@ -11380,8 +11389,8 @@ SpirvInstruction *SpirvEmitter::processIntrinsicMul(const CallExpr *callExpr) {
   }
 
   // All the following cases require handling arg0 and arg1 expressions first.
-  auto *arg0Id = doExpr(arg0);
-  auto *arg1Id = doExpr(arg1);
+  auto *arg0Id = doExprEnsuringRValue(arg0, loc, range);
+  auto *arg1Id = doExprEnsuringRValue(arg1, loc, range);
 
   // mul(scalar, scalar)
   if (isScalarType(arg0Type) && isScalarType(arg1Type))
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 978e88e4ed..e5daed603d 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -80,6 +80,9 @@ class SpirvEmitter : public ASTConsumer {
   void doDecl(const Decl *decl);
   void doStmt(const Stmt *stmt, llvm::ArrayRef<const Attr *> attrs = {});
   SpirvInstruction *doExpr(const Expr *expr, SourceRange rangeOverride = {});
+  SpirvInstruction *doExprEnsuringRValue(const Expr *expr,
+                                         SourceLocation location,
+                                         SourceRange range);
 
   /// Processes the given expression and emits SPIR-V instructions. If the
   /// result is a GLValue, does an additional load.
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl
index 4d04896781..629e7527c3 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl
@@ -1,5 +1,8 @@
 // RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s
 
+StructuredBuffer<float3> buffer_vec;
+StructuredBuffer<float3x3> buffer_mat;
+
 /*
 According to HLSL reference, mul() has the following versions:
 
@@ -448,6 +451,7 @@ void main() {
 // mul( Mat(Mx1) * Mat(1xN) ) --> Mat(MxN) matrix
   float1x3 mat1x3;
   float3x2 mat3x2;
+  float3x3 mat3x3;
   float3x1 mat3x1;
   float1x4 mat1x4;
 
@@ -474,4 +478,25 @@ void main() {
 // CHECK-NEXT: [[result3:%[0-9]+]] = OpCompositeConstruct %mat3v4float [[row0]] [[row1]] [[row2]]
 // CHECK-NEXT:                    OpStore %result3 [[result3]]
   float3x4   result3 = mul( mat3x1, mat1x4 ); // result is float3x4 matrix
+
+  float3 v3;
+
+// CHECK: [[matp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_mat3v3float %buffer_mat %int_0 %int_0
+// CHECK:  [[mat:%[0-9]+]] = OpLoad %mat3v3float [[matp]]
+// CHECK:  [[vec:%[0-9]+]] = OpLoad %v3float %v3
+// CHECK:           {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]]
+  float3 result4 = mul(buffer_mat.Load(0), v3);
+
+// CHECK:  [[mat:%[0-9]+]] = OpLoad %mat3v3float %mat3x3
+// CHECK: [[vecp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v3float %buffer_vec %int_0 %int_1
+// CHECK:  [[vec:%[0-9]+]] = OpLoad %v3float [[vecp]]
+// CHECK:           {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]]
+  float3 result5 = mul(mat3x3, buffer_vec.Load(1));
+
+// CHECK: [[matp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_mat3v3float %buffer_mat %int_0 %int_2
+// CHECK:  [[mat:%[0-9]+]] = OpLoad %mat3v3float [[matp]]
+// CHECK: [[vecp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v3float %buffer_vec %int_0 %int_2
+// CHECK:  [[vec:%[0-9]+]] = OpLoad %v3float [[vecp]]
+// CHECK:           {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]]
+  float3 result6 = mul(buffer_mat.Load(2), buffer_vec.Load(2));
 }

From d8ef89cdcfc356cd983ef918c3fa324e16ba55ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Fri, 30 May 2025 12:53:45 +0200
Subject: [PATCH 49/93] [SPIR-V] Add payload to OpEmitMeshTasksEXT (#7485)

This commit fixes the missing payload parameter for the
OpEmitMeshTasksEXT instruction.
Errors such as the passed variable storage class or type are already
tested.

Fixes #7082

Co-Authored-by: baldurk <baldurk@baldurk.org>

Co-authored-by: baldurk <baldurk@baldurk.org>
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp            |  7 ++++---
 .../meshshading.ext.amplification.payload.hlsl    | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 92e4c687ca..c005f6576c 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -13021,7 +13021,7 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) {
           : spv::StorageClass::Output;
   auto *payloadArg = doExpr(args[3]);
   bool isValid = false;
-  const VarDecl *param = nullptr;
+  SpirvInstruction *param = nullptr;
   if (const auto *implCastExpr = dyn_cast<CastExpr>(args[3])) {
     if (const auto *arg = dyn_cast<DeclRefExpr>(implCastExpr->getSubExpr())) {
       if (const auto *paramDecl = dyn_cast<VarDecl>(arg->getDecl())) {
@@ -13029,7 +13029,8 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) {
           isValid = declIdMapper.createPayloadStageVars(
               sigPoint, sc, paramDecl, /*asInput=*/false, paramDecl->getType(),
               "out.var", &payloadArg);
-          param = paramDecl;
+          param =
+              declIdMapper.getDeclEvalInfo(paramDecl, paramDecl->getLocation());
         }
       }
     }
@@ -13046,7 +13047,7 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) {
 
   if (featureManager.isExtensionEnabled(Extension::EXT_mesh_shader)) {
     // for EXT_mesh_shader, create opEmitMeshTasksEXT.
-    spvBuilder.createEmitMeshTasksEXT(threadX, threadY, threadZ, loc, nullptr,
+    spvBuilder.createEmitMeshTasksEXT(threadX, threadY, threadZ, loc, param,
                                       range);
   } else {
     // for NV_mesh_shader, set TaskCountNV = threadX * threadY * threadZ.
diff --git a/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl b/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl
new file mode 100644
index 0000000000..c50ef252e9
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T as_6_8 -spirv %s -E main -fspv-target-env=vulkan1.1spirv1.4 | FileCheck %s
+
+struct S {
+  uint a;
+};
+
+groupshared S s;
+// CHECK: %s = OpVariable {{.*}} TaskPayloadWorkgroupEXT
+
+[numthreads(1, 1, 1)]
+void main()
+{
+// CHECK: OpEmitMeshTasksEXT %uint_1 %uint_1 %uint_1 %s
+	DispatchMesh(1, 1, 1, s);
+}

From 194b57a37ea4a69f947df1d56ac95aece72ed943 Mon Sep 17 00:00:00 2001
From: Vovan675 <31342770+Vovan675@users.noreply.github.com>
Date: Fri, 30 May 2025 17:42:13 +0300
Subject: [PATCH 50/93] Fix markdown in SPIR-V.rst (#7112)

Fix markdown syntax in a few places. They were shown incorrectly on GitHub.
---
 docs/SPIR-V.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
index f3981ba854..771cf0e5a2 100644
--- a/docs/SPIR-V.rst
+++ b/docs/SPIR-V.rst
@@ -3967,7 +3967,7 @@ RayQuery Mapping to SPIR-V
 +---------------------------------------------------+-------------------------------------------------------------------------+
 |``.WorldRayDirection``                             | ``OpRayQueryGetWorldRayDirectionKHR``                                   |
 +---------------------------------------------------+-------------------------------------------------------------------------+
-|``.WorldRayOrigin`                                 | ``OpRayQueryGetWorldRayOriginKHR``                                      |
+|``.WorldRayOrigin``                                | ``OpRayQueryGetWorldRayOriginKHR``                                      |
 +---------------------------------------------------+-------------------------------------------------------------------------+
 
 Shader Model 6.0+ Wave Intrinsics

From 20f291eb6f0092ac3bf7b652769cc0396e4335ac Mon Sep 17 00:00:00 2001
From: wszqkzqk <wszqkzqk@qq.com>
Date: Fri, 30 May 2025 22:43:08 +0800
Subject: [PATCH 51/93] Add LoongArch 64 bit (#7020)

[LoongArch](https://docs.kernel.org/arch/loongarch/introduction.html) is
a new RISC ISA developed by Loongson. There is already a lot of
[community support and
testing](https://www.phoronix.com/search/LoongArch) for it. Like #4894,
this PR adds support for 64-bit LoongArch.

- Add loongarch64 target to config.guess
- Update config-ix.cmake to support loongarch64
- Tested on loongarch64, see the [build
log](https://github.com/user-attachments/files/17893008/directx-shader-compiler-1.8.2407-1-loong64-build.log)

Signed-off-by: Zhou Qiankang <wszqkzqk@qq.com>
---
 autoconf/config.guess | 3 +++
 cmake/config-ix.cmake | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/autoconf/config.guess b/autoconf/config.guess
index cf0541d1f1..62df94c187 100755
--- a/autoconf/config.guess
+++ b/autoconf/config.guess
@@ -929,6 +929,9 @@ EOF
     ia64:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
+    loongarch64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
     m32r*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 4541d08162..226881ad30 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -367,6 +367,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "wasm64")
   set(LLVM_NATIVE_ARCH WebAssembly)
 elseif (LLVM_NATIVE_ARCH MATCHES "riscv64")
   set(LLVM_NATIVE_ARCH RISCV)
+elseif (LLVM_NATIVE_ARCH MATCHES "loongarch64")
+  set(LLVM_NATIVE_ARCH LoongArch)
 elseif (LLVM_NATIVE_ARCH MATCHES "e2k")
   set(LLVM_NATIVE_ARCH E2K)
 else ()

From 085550991a87e0b7ae3ae988b1da87b73f70a29e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Mon, 2 Jun 2025 13:03:54 +0200
Subject: [PATCH 52/93] [SPIR-V] Fix bool cast on buffers with swizzle (#7497)

HLSL resources can store booleans. SPIR-V resources can't. We handle
this by using integers in resources, and casting at the interface.

Swizzle path was handled a bit differently, and was not going through
the common load/store path which handles the cast.

Fixes #7475
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 16 ++++++------
 tools/clang/lib/SPIRV/SpirvEmitter.h          |  3 ++-
 .../op.vector.swizzle.buffer-store.hlsl       | 26 +++++++++++++++++++
 3 files changed, 36 insertions(+), 9 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index c005f6576c..e62197094f 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -1281,7 +1281,8 @@ SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr,
 }
 
 SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr,
-                                              SpirvInstruction *info) {
+                                              SpirvInstruction *info,
+                                              SourceRange rangeOverride) {
   const auto exprType = expr->getType();
 
   // Do nothing if this is already rvalue
@@ -1316,9 +1317,11 @@ SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr,
     return info;
   }
 
+  SourceRange range =
+      (rangeOverride != SourceRange()) ? rangeOverride : expr->getSourceRange();
   SpirvInstruction *loadedInstr = nullptr;
-  loadedInstr = spvBuilder.createLoad(exprType, info, expr->getExprLoc(),
-                                      expr->getSourceRange());
+  loadedInstr =
+      spvBuilder.createLoad(exprType, info, expr->getExprLoc(), range);
   assert(loadedInstr);
 
   // Special-case: According to the SPIR-V Spec: There is no physical size or
@@ -7969,15 +7972,12 @@ SpirvInstruction *SpirvEmitter::tryToAssignToVectorElements(
   }
 
   auto *vec1 = doExpr(base, range);
-  auto *vec1Val =
-      vec1->isRValue()
-          ? vec1
-          : spvBuilder.createLoad(baseType, vec1, base->getLocStart(), range);
+  auto *vec1Val = vec1->isRValue() ? vec1 : loadIfGLValue(base, vec1, range);
   auto *shuffle = spvBuilder.createVectorShuffle(
       baseType, vec1Val, rhs, selectors, lhs->getLocStart(), range);
 
   if (!tryToAssignToRWBufferRWTexture(base, shuffle))
-    spvBuilder.createStore(vec1, shuffle, lhs->getLocStart(), range);
+    storeValue(vec1, shuffle, base->getType(), lhs->getLocStart(), range);
 
   // TODO: OK, this return value is incorrect for compound assignments, for
   // which cases we should return lvalues. Should at least emit errors if
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index e5daed603d..0c77f2fc24 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -176,7 +176,8 @@ class SpirvEmitter : public ASTConsumer {
   /// Overload with pre computed SpirvEvalInfo.
   ///
   /// The given expr will not be evaluated again.
-  SpirvInstruction *loadIfGLValue(const Expr *expr, SpirvInstruction *info);
+  SpirvInstruction *loadIfGLValue(const Expr *expr, SpirvInstruction *info,
+                                  SourceRange rangeOverride = {});
 
   /// Loads the pointer of the aliased-to-variable if the given expression is a
   /// DeclRefExpr referencing an alias variable. See DeclResultIdMapper for
diff --git a/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl b/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl
new file mode 100644
index 0000000000..5d77d222f9
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl
@@ -0,0 +1,26 @@
+// RUN: %dxc -T cs_6_0 -E main -fcgl  %s -spirv | FileCheck %s
+
+RWStructuredBuffer<bool4>  buffer;
+
+// CHECK-DAG: [[v4_0:%[0-9]+]] = OpConstantComposite %v4uint %uint_0 %uint_0 %uint_0 %uint_0
+// CHECK-DAG: [[v4_1:%[0-9]+]] = OpConstantComposite %v4uint %uint_1 %uint_1 %uint_1 %uint_1
+
+[numthreads(1, 1, 1)]
+void main()
+{
+// CHECK:  [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v4uint %buffer %int_0 %uint_0
+// CHECK: [[load:%[0-9]+]] = OpLoad %v4uint [[ptr]]
+// CHECK: [[cast:%[0-9]+]] = OpINotEqual %v4bool [[load]] [[v4_0]]
+// CHECK: [[shuf:%[0-9]+]] = OpVectorShuffle %v3bool [[cast]] [[cast]] 0 1 2
+// CHECK:                    OpStore %a [[shuf]]
+  bool3 a = buffer[0].xyz;
+
+// CHECK:    [[a:%[0-9]+]] = OpLoad %v3bool %a
+// CHECK:  [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v4uint %buffer %int_0 %uint_1
+// CHECK: [[load:%[0-9]+]] = OpLoad %v4uint [[ptr]]
+// CHECK: [[cast:%[0-9]+]] = OpINotEqual %v4bool [[load]] [[v4_0]]
+// CHECK: [[shuf:%[0-9]+]] = OpVectorShuffle %v4bool [[cast]] [[a]] 4 5 6 3
+// CHECK: [[cast:%[0-9]+]] = OpSelect %v4uint [[shuf]] [[v4_1]] [[v4_0]]
+// CHECK:                    OpStore [[ptr]] [[cast]]
+  buffer[1].xyz = a;
+}

From 72149fa8debb29054a36b5a5450ace1a875c4ac8 Mon Sep 17 00:00:00 2001
From: Dan Brown <61992655+danbrown-amd@users.noreply.github.com>
Date: Mon, 2 Jun 2025 09:42:00 -0600
Subject: [PATCH 53/93] Eliminates layout mismatch when
 vk::BufferPointer::Get() result returned from called function. (#7500)

Fixes #7460.
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        |  1 +
 .../vk.buffer-pointer.rvalue.hlsl             | 42 ++++++++++++++++---
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index e62197094f..24774875f7 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -11025,6 +11025,7 @@ SpirvInstruction *SpirvEmitter::processIntrinsicGetBufferContents(
   if (bufferPointer->isRValue()) {
     bufferPointer->setRValue(false);
     bufferPointer->setStorageClass(spv::StorageClass::PhysicalStorageBuffer);
+    bufferPointer->setLayoutRule(spirvOptions.sBufferLayoutRule);
     return bufferPointer;
   }
 
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl
index 930770cc16..5132c57000 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl
@@ -1,4 +1,5 @@
-// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 %s | FileCheck %s
+// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 %s | FileCheck %s --check-prefix=CHECK --check-prefix=NOFUN
+// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 -DFUN %s | FileCheck %s --check-prefix=CHECK --check-prefix=FUN
 
 // Issue #7302: implicit object argument of Get() evaluates to rvalue
 
@@ -20,16 +21,45 @@ struct Content
 // CHECK: [[V2UINT:%[_0-9A-Za-z]*]] = OpTypeVector [[UINT]] 2
 // CHECK: [[VECTOR:%[_0-9A-Za-z]*]] = OpConstantComposite [[V2UINT]] [[UDEADBEEF]] [[U0]]
 // CHECK: [[CONTENT:%[_0-9A-Za-z]*]] = OpTypeStruct [[INT]]
-// CHECK: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT]]
-// CHECK: [[PPINT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[INT]]
+// FUN: [[PFCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer Function [[CONTENT]]
+// FUN: [[PFINT:%[_0-9A-Za-z]*]] = OpTypePointer Function [[INT]]
+// FUN: [[CONTENT0:%[_0-9A-Za-z]*]] = OpTypeStruct [[INT]]
+// FUN: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT0]]
+// NOFUN: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT]]
+// NOFUN: [[PPINT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[INT]]
+
+Content f() {
+  return bitcast<vk::BufferPointer<Content> >(uint32_t2(0xdeadbeefu,0x0u)).Get();
+}
 
 [numthreads(1, 1, 1)]
 void main()
 {
+#ifdef FUN
+  Content c = f();
+  c.a = 1;
+#else
   bitcast<vk::BufferPointer<Content> >(uint32_t2(0xdeadbeefu,0x0u)).Get().a = 1;
+#endif
 }
 
-// CHECK: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]]
-// CHECK: [[PTR:%[0-9]*]] = OpAccessChain [[PPINT]] [[BITCAST]] [[IO]]
-// CHECK: OpStore [[PTR]] [[I1]] Aligned 4
+// NOFUN: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]]
+// NOFUN: [[PTR:%[0-9]*]] = OpAccessChain [[PPINT]] [[BITCAST]] [[IO]]
+// NOFUN: OpStore [[PTR]] [[I1]] Aligned 4
+
+// FUN: [[VAR:%[_0-9A-Za-z]*]] = OpVariable [[PFCONTENT]] Function
+// FUN: [[CALL:%[0-9]*]] = OpFunctionCall [[CONTENT]] [[F:%[_0-9A-Za-z]*]]
+// FUN: OpStore [[VAR]] [[CALL]]
+// FUN: [[PTR:%[0-9]*]] = OpAccessChain [[PFINT]] [[VAR]] [[IO]]
+// FUN: OpStore [[PTR]] [[I1]]
+
+// FUN: [[F]] = OpFunction [[CONTENT]]
+// FUN: [[VAR:%[_0-9A-Za-z]*]] = OpVariable [[PFCONTENT]] Function
+// FUN: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]]
+// FUN: [[CVAL0:%[0-9]*]] = OpLoad [[CONTENT0]] [[BITCAST]] Aligned 4
+// FUN: [[IVAL:%[0-9]*]] = OpCompositeExtract [[INT]] [[CVAL0]] 0
+// FUN: [[CVAL1:%[0-9]*]] = OpCompositeConstruct [[CONTENT]] [[IVAL]]
+// FUN: OpStore [[VAR]] [[CVAL1]]
+// FUN: [[RET:%[0-9]*]] = OpLoad [[CONTENT]] [[VAR]]
+// FUN: OpReturnValue [[RET]]
 

From 48d6e3c635f0ab3ae79580c37003e6faeca6c671 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Mon, 2 Jun 2025 13:29:26 -0400
Subject: [PATCH 54/93] [SPIRV] Get alignment from pointee type for
 vk::BufferPointer store (#7501)

Fix a small mistake in stores to vk::BufferPointer when storing directly
to the return value of `Get()`. We were using the alignment of the
pointer itself, which is always 8, instead of the alignment of the type
pointed to.

I tested loads, and they do not have the same problem.

Fixes #7459
---
 tools/clang/lib/SPIRV/SpirvBuilder.cpp                   | 2 +-
 .../clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl  | 9 +++++++--
 .../clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl | 3 +++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvBuilder.cpp b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
index a0dcb5420b..d776ba65fb 100644
--- a/tools/clang/lib/SPIRV/SpirvBuilder.cpp
+++ b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
@@ -313,7 +313,7 @@ SpirvStore *SpirvBuilder::createStore(SpirvInstruction *address,
     AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions);
     uint32_t align, size, stride;
     std::tie(align, size) = alignmentCalc.getAlignmentAndSize(
-        address->getAstResultType(), address->getLayoutRule(), llvm::None,
+        source->getAstResultType(), address->getLayoutRule(), llvm::None,
         &stride);
     instruction->setAlignment(align);
   }
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl
index c7d6f0ed2b..cc3b1a0209 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl
@@ -36,7 +36,8 @@ struct TestPushConstant_t
 float4 MainPs(void) : SV_Target0
 {
       float4 vTest = g_PushConstants.m_nBufferDeviceAddress.Get().g_vTestFloat4;
-      return vTest;
+      float f = vk::BufferPointer<float,4>(0xdeadbeefull).Get();
+      return vTest+f;
 }
 
 // CHECK: [[FUN]] = OpFunction
@@ -44,5 +45,9 @@ float4 MainPs(void) : SV_Target0
 // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PGLOBALS]] [[X1]]
 // CHECK: [[X3:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X2]] [[S1]]
 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X3]] Aligned 16
-// CHECK: OpStore [[OUT]] [[X4]]
+// CHECK: [[TEMP_PTR:%[_0-9A-Za-z]*]] = OpConvertUToPtr %_ptr_PhysicalStorageBuffer_float %ulong_3735928559
+// CHECK: [[LD:%[_0-9A-Za-z]*]] = OpLoad %float [[TEMP_PTR]] Aligned 4
+// CHECK: [[CONSTRUCT:%[_0-9A-Za-z]*]] = OpCompositeConstruct [[V4FLOAT]] [[LD]] [[LD]] [[LD]] [[LD]]
+// CHECK: [[ADD:%[_0-9A-Za-z]*]] = OpFAdd [[V4FLOAT]] [[X4]] [[CONSTRUCT]]
+// CHECK: OpStore [[OUT]] [[ADD]]
 // CHECK: OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl
index b2efd02cbd..843815a4a0 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl
@@ -40,6 +40,7 @@ float4 MainPs(void) : SV_Target0
 {
       float4 vTest = float4(1.0,0.0,0.0,0.0);
       g_PushConstants.m_nBufferDeviceAddress.Get().g_vTestFloat4 = vTest;
+      vk::BufferPointer<float,4>(0xdeadbeefull).Get() = 4.5f;
       return vTest;
 }
 
@@ -48,5 +49,7 @@ float4 MainPs(void) : SV_Target0
 // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PGLOBALS]] [[X1]]
 // CHECK: [[X3:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X2]] [[S1]]
 // CHECK: OpStore [[X3]] [[CV4FLOAT]] Aligned 16
+// CHECK: [[TEMP_PTR:%[_0-9A-Za-z]*]] = OpConvertUToPtr %_ptr_PhysicalStorageBuffer_float %ulong_3735928559
+// CHECK: OpStore [[TEMP_PTR]] %float_4_5 Aligned 4
 // CHECK: OpStore [[OUT]] [[CV4FLOAT]]
 // CHECK: OpFunctionEnd

From 8a8b29f967b5925a970949984442b3783d730551 Mon Sep 17 00:00:00 2001
From: Dan Brown <61992655+danbrown-amd@users.noreply.github.com>
Date: Tue, 3 Jun 2025 10:22:13 -0600
Subject: [PATCH 55/93] [spirv] AMD work graphs extension (#7353)

Enables work graphs for SPIR-V target, based on AMD_shader_enqueue
extension.
Closes #5960.
---
 tools/clang/include/clang/AST/HlslTypes.h     |   5 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 +
 .../include/clang/SPIRV/FeatureManager.h      |   1 +
 .../clang/include/clang/SPIRV/SpirvBuilder.h  |  20 +
 .../clang/include/clang/SPIRV/SpirvContext.h  |  53 ++
 .../include/clang/SPIRV/SpirvInstruction.h    | 146 ++++++
 tools/clang/include/clang/SPIRV/SpirvType.h   |  23 +
 .../clang/include/clang/SPIRV/SpirvVisitor.h  |   6 +
 tools/clang/lib/AST/HlslTypes.cpp             |  40 ++
 tools/clang/lib/SPIRV/CapabilityVisitor.cpp   |  64 ++-
 tools/clang/lib/SPIRV/DebugTypeVisitor.cpp    |  11 +
 tools/clang/lib/SPIRV/DeclResultIdMapper.cpp  |  35 +-
 tools/clang/lib/SPIRV/EmitVisitor.cpp         | 207 +++++++-
 tools/clang/lib/SPIRV/EmitVisitor.h           |  15 +-
 tools/clang/lib/SPIRV/FeatureManager.cpp      |   3 +
 tools/clang/lib/SPIRV/GlPerVertex.cpp         |   3 +
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp    |  10 +
 tools/clang/lib/SPIRV/PreciseVisitor.cpp      |   3 +
 tools/clang/lib/SPIRV/SpirvBuilder.cpp        |  72 ++-
 tools/clang/lib/SPIRV/SpirvContext.cpp        |  16 +
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 452 +++++++++++++++++-
 tools/clang/lib/SPIRV/SpirvEmitter.h          |  44 ++
 tools/clang/lib/SPIRV/SpirvInstruction.cpp    |  57 ++-
 tools/clang/lib/SPIRV/SpirvType.cpp           |   4 +
 tools/clang/lib/Sema/SemaHLSL.cpp             |   4 +
 .../test/CodeGenSPIRV/bezier.hull.hlsl2spv    | 144 +++---
 .../hs.const.output-patch.out.hlsl            |   6 +-
 .../CodeGenSPIRV/node.barrier.compute.hlsl    |  15 +
 .../CodeGenSPIRV/node.barrier.memory-arg.hlsl |  60 +++
 .../CodeGenSPIRV/node.barrier.object-arg.hlsl | 213 +++++++++
 .../node.broadcasting.no-input.hlsl           |  15 +
 .../node.coalescing.num-threads.hlsl          |  16 +
 .../test/CodeGenSPIRV/node.dispatch-grid.hlsl |  28 ++
 .../CodeGenSPIRV/node.empty-node-input.hlsl   |  28 ++
 .../node.finished-cross-group-sharing.hlsl    |  32 ++
 .../node.get-input-record-count.hlsl          |  25 +
 .../node.get-node-output-record.multiple.hlsl |  72 +++
 .../node.get-remaining-recursion-levels.hlsl  |  26 +
 .../node.group-shared.barrier.hlsl            |  18 +
 .../test/CodeGenSPIRV/node.group-shared.hlsl  |  24 +
 .../node.increment-output-count.group.hlsl    |  22 +
 .../node.increment-output-count.thread.hlsl   |  22 +
 ...node.input-record.dispatch-grid.array.hlsl |  26 +
 ...ode.input-record.dispatch-grid.nested.hlsl |  32 ++
 .../CodeGenSPIRV/node.max-dispatch-grid.hlsl  |  30 ++
 .../test/CodeGenSPIRV/node.max-records.hlsl   |  45 ++
 .../test/CodeGenSPIRV/node.member.read.hlsl   | 150 ++++++
 .../CodeGenSPIRV/node.member.read.types.hlsl  | 193 ++++++++
 .../test/CodeGenSPIRV/node.member.write.hlsl  |  88 ++++
 .../node.member.write.matrix.hlsl             | 123 +++++
 .../CodeGenSPIRV/node.member.write.types.hlsl | 150 ++++++
 tools/clang/test/CodeGenSPIRV/node.mesh.hlsl  |  88 ++++
 .../CodeGenSPIRV/node.output-complete.hlsl    |  33 ++
 .../node.output.is-valid.empty.hlsl           |  19 +
 .../CodeGenSPIRV/node.output.is-valid.hlsl    |  24 +
 .../clang/test/CodeGenSPIRV/node.renamed.hlsl |  23 +
 .../test/CodeGenSPIRV/node.share-input.hlsl   |  42 ++
 .../test/CodeGenSPIRV/node.sparse-nodes.hlsl  | 141 ++++++
 .../CodeGenSPIRV/node.thread.num-threads.hlsl |  15 +
 .../node.thread.num-threads.none.hlsl         |  15 +
 .../vk.attribute.image-format.hlsl            |   6 -
 61 files changed, 3193 insertions(+), 112 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.max-records.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.member.read.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.member.write.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.mesh.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.share-input.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl

diff --git a/tools/clang/include/clang/AST/HlslTypes.h b/tools/clang/include/clang/AST/HlslTypes.h
index 58d2d916b1..43c1effdb8 100644
--- a/tools/clang/include/clang/AST/HlslTypes.h
+++ b/tools/clang/include/clang/AST/HlslTypes.h
@@ -485,7 +485,10 @@ bool IsHLSLObjectWithImplicitMemberAccess(clang::QualType type);
 bool IsHLSLObjectWithImplicitROMemberAccess(clang::QualType type);
 bool IsHLSLRWNodeInputRecordType(clang::QualType type);
 bool IsHLSLRONodeInputRecordType(clang::QualType type);
+bool IsHLSLDispatchNodeInputRecordType(clang::QualType type);
+bool IsHLSLNodeRecordArrayType(clang::QualType type);
 bool IsHLSLNodeOutputType(clang::QualType type);
+bool IsHLSLEmptyNodeRecordType(clang::QualType type);
 
 DXIL::NodeIOKind GetNodeIOType(clang::QualType type);
 
@@ -495,6 +498,8 @@ bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT);
 bool IsHLSLBuiltinRayAttributeStruct(clang::QualType QT);
 bool IsHLSLAggregateType(clang::QualType type);
 clang::QualType GetHLSLResourceResultType(clang::QualType type);
+clang::QualType GetHLSLNodeIOResultType(clang::ASTContext &astContext,
+                                        clang::QualType type);
 unsigned GetHLSLResourceTemplateUInt(clang::QualType type);
 bool IsIncompleteHLSLResourceArrayType(clang::ASTContext &context,
                                        clang::QualType type);
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0d98792688..cbd9412566 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8060,6 +8060,8 @@ def err_hlsl_vk_pointer_cast_alignment: Error<
   "Vulkan buffer pointer cannot be cast to greater alignment">;
 def err_hlsl_vk_static_pointer_cast_type: Error<
   "vk::static_pointer_cast() content type must be base class of argument's content type">;
+def warn_spirv_node_shaders_experimental : Warning<
+  "SPIR-V implementation of node shaders is experimental and subject to change">;
 // SPIRV Change Ends
 
 let CategoryName = "OpenMP Issue" in {
diff --git a/tools/clang/include/clang/SPIRV/FeatureManager.h b/tools/clang/include/clang/SPIRV/FeatureManager.h
index 3c1871df37..94dc5bf1ab 100644
--- a/tools/clang/include/clang/SPIRV/FeatureManager.h
+++ b/tools/clang/include/clang/SPIRV/FeatureManager.h
@@ -57,6 +57,7 @@ enum class Extension {
   KHR_ray_query,
   EXT_shader_image_int64,
   KHR_physical_storage_buffer,
+  AMD_shader_enqueue,
   KHR_vulkan_memory_model,
   NV_compute_shader_derivatives,
   KHR_compute_shader_derivatives,
diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
index 2da14dab54..465f7313f1 100644
--- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h
+++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
@@ -434,6 +434,25 @@ class SpirvBuilder {
       QualType resultType, NonSemanticDebugPrintfInstructions instId,
       llvm::ArrayRef<SpirvInstruction *> operands, SourceLocation);
 
+  SpirvInstruction *createIsNodePayloadValid(SpirvInstruction *payloadArray,
+                                             SpirvInstruction *nodeIndex,
+                                             SourceLocation);
+
+  SpirvInstruction *createNodePayloadArrayLength(SpirvInstruction *payloadArray,
+                                                 SourceLocation);
+
+  SpirvInstruction *createAllocateNodePayloads(QualType resultType,
+                                               spv::Scope allocationScope,
+                                               SpirvInstruction *shaderIndex,
+                                               SpirvInstruction *recordCount,
+                                               SourceLocation);
+
+  void createEnqueueOutputNodePayloads(SpirvInstruction *payload,
+                                       SourceLocation);
+
+  SpirvInstruction *createFinishWritingNodePayload(SpirvInstruction *payload,
+                                                   SourceLocation);
+
   /// \brief Creates an OpMemoryBarrier or OpControlBarrier instruction with the
   /// given flags. If execution scope (exec) is provided, an OpControlBarrier
   /// is created; otherwise an OpMemoryBarrier is created.
@@ -763,6 +782,7 @@ class SpirvBuilder {
                        llvm::ArrayRef<SpirvConstant *> constituents,
                        bool specConst = false);
   SpirvConstant *getConstantNull(QualType);
+  SpirvConstant *getConstantString(llvm::StringRef str, bool specConst = false);
   SpirvUndef *getUndef(QualType);
 
   SpirvString *createString(llvm::StringRef str);
diff --git a/tools/clang/include/clang/SPIRV/SpirvContext.h b/tools/clang/include/clang/SPIRV/SpirvContext.h
index 50ff77d4b4..8e0458e731 100644
--- a/tools/clang/include/clang/SPIRV/SpirvContext.h
+++ b/tools/clang/include/clang/SPIRV/SpirvContext.h
@@ -98,6 +98,21 @@ struct RuntimeArrayTypeMapInfo {
   }
 };
 
+// Provides DenseMapInfo for NodePayloadArrayType so we can create a DenseSet of
+// node payload array types.
+struct NodePayloadArrayTypeMapInfo {
+  static inline NodePayloadArrayType *getEmptyKey() { return nullptr; }
+  static inline NodePayloadArrayType *getTombstoneKey() { return nullptr; }
+  static unsigned getHashValue(const NodePayloadArrayType *Val) {
+    return llvm::hash_combine(Val->getElementType(), Val->getNodeDecl());
+  }
+  static bool isEqual(const NodePayloadArrayType *LHS,
+                      const NodePayloadArrayType *RHS) {
+    // Either both are null, or both should have the same underlying type.
+    return (LHS == RHS) || (LHS && RHS && *LHS == *RHS);
+  }
+};
+
 // Provides DenseMapInfo for ImageType so we can create a DenseSet of
 // image types.
 struct ImageTypeMapInfo {
@@ -270,6 +285,9 @@ class SpirvContext {
   const RuntimeArrayType *
   getRuntimeArrayType(const SpirvType *elemType,
                       llvm::Optional<uint32_t> arrayStride);
+  const NodePayloadArrayType *
+  getNodePayloadArrayType(const SpirvType *elemType,
+                          const ParmVarDecl *nodeDecl);
 
   const StructType *getStructType(
       llvm::ArrayRef<StructType::FieldInfo> fields, llvm::StringRef name,
@@ -346,6 +364,7 @@ class SpirvContext {
   bool isDS() const { return curShaderModelKind == ShaderModelKind::Domain; }
   bool isCS() const { return curShaderModelKind == ShaderModelKind::Compute; }
   bool isLib() const { return curShaderModelKind == ShaderModelKind::Library; }
+  bool isNode() const { return curShaderModelKind == ShaderModelKind::Node; }
   bool isRay() const {
     return curShaderModelKind >= ShaderModelKind::RayGeneration &&
            curShaderModelKind <= ShaderModelKind::Callable;
@@ -437,6 +456,31 @@ class SpirvContext {
            instructionsWithLoweredType.end();
   }
 
+  void registerDispatchGridIndex(const RecordDecl *decl, unsigned index) {
+    auto iter = dispatchGridIndices.find(decl);
+    if (iter == dispatchGridIndices.end()) {
+      dispatchGridIndices[decl] = index;
+    }
+  }
+
+  llvm::Optional<unsigned> getDispatchGridIndex(const RecordDecl *decl) {
+    auto iter = dispatchGridIndices.find(decl);
+    if (iter != dispatchGridIndices.end()) {
+      return iter->second;
+    }
+    return llvm::None;
+  }
+
+  void registerNodeDeclPayloadType(const NodePayloadArrayType *type,
+                                   const ParmVarDecl *decl) {
+    nodeDecls[decl] = type;
+  }
+
+  const NodePayloadArrayType *getNodeDeclPayloadType(const ParmVarDecl *decl) {
+    auto iter = nodeDecls.find(decl);
+    return iter == nodeDecls.end() ? nullptr : iter->second;
+  }
+
 private:
   /// \brief The allocator used to create SPIR-V entity objects.
   ///
@@ -481,6 +525,8 @@ class SpirvContext {
   llvm::DenseSet<const ArrayType *, ArrayTypeMapInfo> arrayTypes;
   llvm::DenseSet<const RuntimeArrayType *, RuntimeArrayTypeMapInfo>
       runtimeArrayTypes;
+  llvm::DenseSet<const NodePayloadArrayType *, NodePayloadArrayTypeMapInfo>
+      nodePayloadArrayTypes;
   llvm::SmallVector<const StructType *, 8> structTypes;
   llvm::SmallVector<const HybridStructType *, 8> hybridStructTypes;
   llvm::DenseMap<const SpirvType *, SCToPtrTyMap> pointerTypes;
@@ -507,6 +553,9 @@ class SpirvContext {
   llvm::StringMap<RichDebugInfo> debugInfo;
   SpirvDebugInstruction *currentLexicalScope;
 
+  // Mapping from record declarations to their registered dispatch grid index.
+  llvm::MapVector<const RecordDecl *, unsigned> dispatchGridIndices;
+
   // Mapping from SPIR-V type to debug type instruction.
   // The purpose is not to generate several DebugType* instructions for the same
   // type if the type is used for several variables.
@@ -538,6 +587,10 @@ class SpirvContext {
 
   // Set of instructions that already have lowered SPIR-V types.
   llvm::DenseSet<const SpirvInstruction *> instructionsWithLoweredType;
+
+  // Mapping from shader entry function parameter declaration to node payload
+  // array type.
+  llvm::MapVector<const ParmVarDecl *, const NodePayloadArrayType *> nodeDecls;
 };
 
 } // end namespace spirv
diff --git a/tools/clang/include/clang/SPIRV/SpirvInstruction.h b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
index 20cd57525c..52f4128a6c 100644
--- a/tools/clang/include/clang/SPIRV/SpirvInstruction.h
+++ b/tools/clang/include/clang/SPIRV/SpirvInstruction.h
@@ -67,6 +67,7 @@ class SpirvInstruction {
     IK_ConstantInteger,
     IK_ConstantFloat,
     IK_ConstantComposite,
+    IK_ConstantString,
     IK_ConstantNull,
 
     // Pointer <-> uint conversions.
@@ -165,6 +166,13 @@ class SpirvInstruction {
     IK_DebugTypeMember,
     IK_DebugTypeTemplate,
     IK_DebugTypeTemplateParameter,
+
+    // For workgraph instructions
+    IK_IsNodePayloadValid,
+    IK_NodePayloadArrayLength,
+    IK_AllocateNodePayloads,
+    IK_EnqueueNodePayloads,
+    IK_FinishWritingNodePayload,
   };
 
   // All instruction classes should include a releaseMemory method.
@@ -440,9 +448,13 @@ class SpirvExecutionMode : public SpirvExecutionModeBase {
 
   bool invokeVisitor(Visitor *v) override;
 
+  SpirvFunction *getEntryPoint() const { return entryPoint; }
+  spv::ExecutionMode getExecutionMode() const { return execMode; }
   llvm::ArrayRef<uint32_t> getParams() const { return params; }
 
 private:
+  SpirvFunction *entryPoint;
+  spv::ExecutionMode execMode;
   llvm::SmallVector<uint32_t, 4> params;
 };
 
@@ -1056,6 +1068,119 @@ class SpirvBarrier : public SpirvInstruction {
   llvm::Optional<spv::Scope> executionScope;
 };
 
+/// \brief OpIsNodePayloadValidAMDX instruction
+class SpirvIsNodePayloadValid : public SpirvInstruction {
+public:
+  SpirvIsNodePayloadValid(QualType resultType, SourceLocation loc,
+                          SpirvInstruction *payloadArray,
+                          SpirvInstruction *nodeIndex);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvIsNodePayloadValid)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_IsNodePayloadValid;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  SpirvInstruction *getPayloadArray() const { return payloadArray; }
+  SpirvInstruction *getNodeIndex() const { return nodeIndex; }
+
+private:
+  SpirvInstruction *payloadArray;
+  SpirvInstruction *nodeIndex;
+};
+
+/// \brief OpNodePayloadArrayLengthAMDX instruction
+class SpirvNodePayloadArrayLength : public SpirvInstruction {
+public:
+  SpirvNodePayloadArrayLength(QualType resultType, SourceLocation loc,
+                              SpirvInstruction *payloadArray);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvNodePayloadArrayLength)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_NodePayloadArrayLength;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  SpirvInstruction *getPayloadArray() const { return payloadArray; }
+
+private:
+  SpirvInstruction *payloadArray;
+};
+
+/// \brief OpAllocateNodePayloadsAMDX instruction
+class SpirvAllocateNodePayloads : public SpirvInstruction {
+public:
+  SpirvAllocateNodePayloads(QualType resultType, SourceLocation loc,
+                            spv::Scope allocationScope,
+                            SpirvInstruction *shaderIndex,
+                            SpirvInstruction *recordCount);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvAllocateNodePayloads)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_AllocateNodePayloads;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  spv::Scope getAllocationScope() const { return allocationScope; }
+  SpirvInstruction *getShaderIndex() const { return shaderIndex; }
+  SpirvInstruction *getRecordCount() const { return recordCount; }
+
+private:
+  spv::Scope allocationScope;
+  SpirvInstruction *shaderIndex;
+  SpirvInstruction *recordCount;
+};
+
+/// \brief OpEnqueueNodePayloadsAMDX instruction
+class SpirvEnqueueNodePayloads : public SpirvInstruction {
+public:
+  SpirvEnqueueNodePayloads(SourceLocation loc, SpirvInstruction *payload);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvEnqueueNodePayloads)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_EnqueueNodePayloads;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  SpirvInstruction *getPayload() const { return payload; }
+
+private:
+  SpirvInstruction *payload;
+};
+
+/// \brief OpFinishWritingNodePayloadAMDX instruction
+class SpirvFinishWritingNodePayload : public SpirvInstruction {
+public:
+  SpirvFinishWritingNodePayload(QualType resultType, SourceLocation loc,
+                                SpirvInstruction *payload);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvFinishWritingNodePayload)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_FinishWritingNodePayload;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  SpirvInstruction *getPayload() const { return payload; }
+
+private:
+  SpirvInstruction *payload;
+};
+
 /// \brief Represents SPIR-V binary operation instructions.
 ///
 /// This class includes:
@@ -1352,6 +1477,27 @@ class SpirvConstantNull : public SpirvConstant {
   bool operator==(const SpirvConstantNull &that) const;
 };
 
+class SpirvConstantString : public SpirvConstant {
+public:
+  SpirvConstantString(llvm::StringRef stringLiteral, bool isSpecConst = false);
+
+  DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvConstantString)
+
+  // For LLVM-style RTTI
+  static bool classof(const SpirvInstruction *inst) {
+    return inst->getKind() == IK_ConstantString;
+  }
+
+  bool invokeVisitor(Visitor *v) override;
+
+  bool operator==(const SpirvConstantString &that) const;
+
+  llvm::StringRef getString() const { return str; }
+
+private:
+  std::string str;
+};
+
 class SpirvConvertPtrToU : public SpirvInstruction {
 public:
   SpirvConvertPtrToU(SpirvInstruction *ptr, QualType type,
diff --git a/tools/clang/include/clang/SPIRV/SpirvType.h b/tools/clang/include/clang/SPIRV/SpirvType.h
index d39fc6943b..7966e3e0de 100644
--- a/tools/clang/include/clang/SPIRV/SpirvType.h
+++ b/tools/clang/include/clang/SPIRV/SpirvType.h
@@ -51,6 +51,7 @@ class SpirvType {
     TK_SampledImage,
     TK_Array,
     TK_RuntimeArray,
+    TK_NodePayloadArrayAMD,
     TK_Struct,
     TK_Pointer,
     TK_ForwardPointer,
@@ -291,6 +292,26 @@ class RuntimeArrayType : public SpirvType {
   llvm::Optional<uint32_t> stride;
 };
 
+class NodePayloadArrayType : public SpirvType {
+public:
+  NodePayloadArrayType(const SpirvType *elemType, const ParmVarDecl *decl)
+      : SpirvType(TK_NodePayloadArrayAMD), elementType(elemType),
+        nodeDecl(decl) {}
+
+  static bool classof(const SpirvType *t) {
+    return t->getKind() == TK_NodePayloadArrayAMD;
+  }
+
+  bool operator==(const NodePayloadArrayType &that) const;
+
+  const SpirvType *getElementType() const { return elementType; }
+  const ParmVarDecl *getNodeDecl() const { return nodeDecl; }
+
+private:
+  const SpirvType *elementType;
+  const ParmVarDecl *nodeDecl;
+};
+
 // The StructType is the lowered type that best represents what a structure type
 // is in SPIR-V. Contains all necessary information for properly emitting a
 // SPIR-V structure type.
@@ -627,6 +648,8 @@ bool SpirvType::isOrContainsType(const SpirvType *type) {
     return isOrContainsType<T, Bitwidth>(pointerType->getPointeeType());
   if (const auto *raType = dyn_cast<RuntimeArrayType>(type))
     return isOrContainsType<T, Bitwidth>(raType->getElementType());
+  if (const auto *npaType = dyn_cast<NodePayloadArrayType>(type))
+    return isOrContainsType<T, Bitwidth>(npaType->getElementType());
   if (const auto *imgType = dyn_cast<ImageType>(type))
     return isOrContainsType<T, Bitwidth>(imgType->getSampledType());
   if (const auto *sampledImageType = dyn_cast<SampledImageType>(type))
diff --git a/tools/clang/include/clang/SPIRV/SpirvVisitor.h b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
index fef06da503..a6de26c807 100644
--- a/tools/clang/include/clang/SPIRV/SpirvVisitor.h
+++ b/tools/clang/include/clang/SPIRV/SpirvVisitor.h
@@ -82,6 +82,11 @@ class Visitor {
   DEFINE_VISIT_METHOD(SpirvAccessChain)
   DEFINE_VISIT_METHOD(SpirvAtomic)
   DEFINE_VISIT_METHOD(SpirvBarrier)
+  DEFINE_VISIT_METHOD(SpirvIsNodePayloadValid)
+  DEFINE_VISIT_METHOD(SpirvNodePayloadArrayLength)
+  DEFINE_VISIT_METHOD(SpirvAllocateNodePayloads)
+  DEFINE_VISIT_METHOD(SpirvEnqueueNodePayloads)
+  DEFINE_VISIT_METHOD(SpirvFinishWritingNodePayload)
   DEFINE_VISIT_METHOD(SpirvBinaryOp)
   DEFINE_VISIT_METHOD(SpirvBitFieldExtract)
   DEFINE_VISIT_METHOD(SpirvBitFieldInsert)
@@ -89,6 +94,7 @@ class Visitor {
   DEFINE_VISIT_METHOD(SpirvConstantInteger)
   DEFINE_VISIT_METHOD(SpirvConstantFloat)
   DEFINE_VISIT_METHOD(SpirvConstantComposite)
+  DEFINE_VISIT_METHOD(SpirvConstantString)
   DEFINE_VISIT_METHOD(SpirvConstantNull)
   DEFINE_VISIT_METHOD(SpirvConvertPtrToU)
   DEFINE_VISIT_METHOD(SpirvConvertUToPtr)
diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index 017f0f7218..7693c065be 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -587,6 +587,12 @@ bool IsHLSLRONodeInputRecordType(clang::QualType type) {
          static_cast<uint32_t>(DXIL::NodeIOFlags::Input);
 }
 
+bool IsHLSLDispatchNodeInputRecordType(clang::QualType type) {
+  return IsHLSLNodeInputType(type) &&
+         (static_cast<uint32_t>(GetNodeIOType(type)) &
+          static_cast<uint32_t>(DXIL::NodeIOFlags::DispatchRecord)) != 0;
+}
+
 bool IsHLSLNodeOutputType(clang::QualType type) {
   return (static_cast<uint32_t>(GetNodeIOType(type)) &
           (static_cast<uint32_t>(DXIL::NodeIOFlags::Output) |
@@ -594,6 +600,23 @@ bool IsHLSLNodeOutputType(clang::QualType type) {
          static_cast<uint32_t>(DXIL::NodeIOFlags::Output);
 }
 
+bool IsHLSLNodeRecordArrayType(clang::QualType type) {
+  if (const RecordType *RT = type->getAs<RecordType>()) {
+    StringRef name = RT->getDecl()->getName();
+    if (name == "ThreadNodeOutputRecords" || name == "GroupNodeOutputRecords" ||
+        name == "GroupNodeInputRecords" || name == "RWGroupNodeInputRecords" ||
+        name == "EmptyNodeInput")
+      return true;
+  }
+  return false;
+}
+
+bool IsHLSLEmptyNodeRecordType(clang::QualType type) {
+  return (static_cast<uint32_t>(GetNodeIOType(type)) &
+          static_cast<uint32_t>(DXIL::NodeIOFlags::EmptyRecord)) ==
+         static_cast<uint32_t>(DXIL::NodeIOFlags::EmptyRecord);
+}
+
 bool IsHLSLStructuredBufferType(clang::QualType type) {
   if (const HLSLResourceAttr *Attr = getAttr<HLSLResourceAttr>(type))
     return Attr->getResKind() == DXIL::ResourceKind::StructuredBuffer;
@@ -836,6 +859,23 @@ QualType GetHLSLResourceResultType(QualType type) {
   return HandleFieldDecl->getType();
 }
 
+QualType GetHLSLNodeIOResultType(ASTContext &astContext, QualType type) {
+  if (hlsl::IsHLSLEmptyNodeRecordType(type)) {
+    RecordDecl *RD = astContext.buildImplicitRecord("");
+    RD->startDefinition();
+    RD->completeDefinition();
+    return astContext.getRecordType(RD);
+  } else if (hlsl::IsHLSLNodeType(type)) {
+    const RecordType *recordType = type->getAs<RecordType>();
+    if (const auto *templateDecl =
+            dyn_cast<ClassTemplateSpecializationDecl>(recordType->getDecl())) {
+      const auto &templateArgs = templateDecl->getTemplateArgs();
+      return templateArgs[0].getAsType();
+    }
+  }
+  return type;
+}
+
 unsigned GetHLSLResourceTemplateUInt(clang::QualType type) {
   const ClassTemplateSpecializationDecl *templateDecl =
       cast<ClassTemplateSpecializationDecl>(
diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
index 43ab2540b4..c8444a3b81 100644
--- a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
+++ b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp
@@ -122,6 +122,12 @@ void CapabilityVisitor::addCapabilityForType(const SpirvType *type,
     }
     addCapabilityForType(raType->getElementType(), loc, sc);
   }
+  // Node payload array also requires additional capability.
+  else if (const auto *npaType = dyn_cast<NodePayloadArrayType>(type)) {
+    addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc);
+    addCapability(spv::Capability::ShaderEnqueueAMDX, loc);
+    addCapabilityForType(npaType->getElementType(), loc, sc);
+  }
   // Image types
   else if (const auto *imageType = dyn_cast<ImageType>(type)) {
     switch (imageType->getDimension()) {
@@ -254,6 +260,19 @@ bool CapabilityVisitor::visit(SpirvDecoration *decor) {
     addCapability(spv::Capability::FragmentBarycentricKHR);
     break;
   }
+  case spv::Decoration::NodeSharesPayloadLimitsWithAMDX:
+  case spv::Decoration::NodeMaxPayloadsAMDX:
+  case spv::Decoration::TrackFinishWritingAMDX:
+  case spv::Decoration::PayloadNodeNameAMDX:
+  case spv::Decoration::PayloadNodeBaseIndexAMDX:
+  case spv::Decoration::PayloadNodeSparseArrayAMDX:
+  case spv::Decoration::PayloadNodeArraySizeAMDX:
+  case spv::Decoration::PayloadDispatchIndirectAMDX: {
+    featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc);
+    addCapability(spv::Capability::ShaderEnqueueAMDX, loc);
+    addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc);
+    break;
+  }
   // Capabilities needed for built-ins
   case spv::Decoration::BuiltIn: {
     AddVulkanMemoryModelForVolatile(decor, loc);
@@ -532,8 +551,14 @@ bool CapabilityVisitor::visitInstruction(SpirvInstruction *instr) {
     addCapability(spv::Capability::GroupNonUniformQuad);
     break;
   case spv::Op::OpVariable: {
-    if (spvOptions.enableReflect &&
-        !cast<SpirvVariable>(instr)->getHlslUserType().empty()) {
+    auto var = cast<SpirvVariable>(instr);
+    auto storage = var->getStorageClass();
+    if (storage == spv::StorageClass::NodePayloadAMDX) {
+      featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc);
+      addCapability(spv::Capability::ShaderEnqueueAMDX, loc);
+      addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc);
+    }
+    if (spvOptions.enableReflect && !var->getHlslUserType().empty()) {
       addExtension(Extension::GOOGLE_user_type, "HLSL User Type", loc);
       addExtension(Extension::GOOGLE_hlsl_functionality1, "HLSL User Type",
                    loc);
@@ -577,6 +602,28 @@ bool CapabilityVisitor::visitInstruction(SpirvInstruction *instr) {
     }
     break;
   }
+  case spv::Op::OpConstantStringAMDX:
+  case spv::Op::OpSpecConstantStringAMDX:
+  case spv::Op::OpAllocateNodePayloadsAMDX:
+  case spv::Op::OpEnqueueNodePayloadsAMDX:
+  case spv::Op::OpIsNodePayloadValidAMDX:
+  case spv::Op::OpFinishWritingNodePayloadAMDX: {
+    featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc);
+    addCapability(spv::Capability::ShaderEnqueueAMDX, loc);
+    addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc);
+    break;
+  }
+  case spv::Op::OpControlBarrier:
+  case spv::Op::OpMemoryBarrier: {
+    auto barrier = cast<SpirvBarrier>(instr);
+    if ((bool)(barrier->getMemorySemantics() &
+               spv::MemorySemanticsMask::OutputMemoryKHR)) {
+      featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "NODE_OUTPUT_MEMORY",
+                                      loc);
+      addCapability(spv::Capability::VulkanMemoryModel, loc);
+    }
+    break;
+  }
 
   default:
     break;
@@ -642,6 +689,19 @@ bool CapabilityVisitor::visit(SpirvExecutionModeBase *execMode) {
   SourceLocation entryPointSourceLocation =
       execMode->getEntryPoint()->getSourceLocation();
   switch (executionMode) {
+  case spv::ExecutionMode::CoalescingAMDX:
+  case spv::ExecutionMode::MaxNodeRecursionAMDX:
+  case spv::ExecutionMode::StaticNumWorkgroupsAMDX:
+  case spv::ExecutionMode::MaxNumWorkgroupsAMDX:
+    featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs",
+                                    execModeSourceLocation);
+    addCapability(spv::Capability::ShaderEnqueueAMDX, execModeSourceLocation);
+    addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3",
+                 execModeSourceLocation);
+    break;
+  case spv::ExecutionMode::SubgroupSize:
+    addCapability(spv::Capability::SubgroupDispatch, execModeSourceLocation);
+    break;
   case spv::ExecutionMode::PostDepthCoverage:
     addCapability(spv::Capability::SampleMaskPostDepthCoverage,
                   entryPointSourceLocation);
diff --git a/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp b/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp
index 058e7b6255..24fab092cc 100644
--- a/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp
@@ -356,6 +356,17 @@ SpirvDebugType *DebugTypeVisitor::lowerToDebugType(const SpirvType *spirvType) {
     debugType = spvContext.getDebugTypeArray(spirvType, elemDebugType, counts);
     break;
   }
+  case SpirvType::TK_NodePayloadArrayAMD: {
+    auto *arrType = dyn_cast<NodePayloadArrayType>(spirvType);
+    SpirvDebugInstruction *elemDebugType =
+        lowerToDebugType(arrType->getElementType());
+
+    llvm::SmallVector<uint32_t, 4> counts;
+    counts.push_back(0u);
+
+    debugType = spvContext.getDebugTypeArray(spirvType, elemDebugType, counts);
+    break;
+  }
   case SpirvType::TK_Vector: {
     auto *vecType = dyn_cast<VectorType>(spirvType);
     SpirvDebugInstruction *elemDebugType =
diff --git a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
index de73d5e417..9d0d8f51a3 100644
--- a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
+++ b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
@@ -467,6 +467,10 @@ hlsl::DxilParamInputQual deduceParamQual(const DeclaratorDecl *decl,
   if (decl->hasAttr<HLSLPayloadAttr>())
     return hlsl::DxilParamInputQual::InPayload;
 
+  if (hlsl::IsHLSLNodeType(type)) {
+    return hlsl::DxilParamInputQual::NodeIO;
+  }
+
   return asInput ? hlsl::DxilParamInputQual::In : hlsl::DxilParamInputQual::Out;
 }
 
@@ -475,6 +479,9 @@ hlsl::DxilParamInputQual deduceParamQual(const DeclaratorDecl *decl,
 const hlsl::SigPoint *deduceSigPoint(const DeclaratorDecl *decl, bool asInput,
                                      const hlsl::ShaderModel::Kind kind,
                                      bool forPCF) {
+  if (kind == hlsl::ShaderModel::Kind::Node) {
+    return hlsl::SigPoint::GetSigPoint(hlsl::SigPoint::Kind::CSIn);
+  }
   return hlsl::SigPoint::GetSigPoint(hlsl::SigPointFromInputQual(
       deduceParamQual(decl, asInput), kind, forPCF));
 }
@@ -2158,6 +2165,8 @@ bool DeclResultIdMapper::assignLocations(
     llvm::DenseSet<StageVariableLocationInfo, StageVariableLocationInfo>
         *stageVariableLocationInfo) {
   for (const auto *var : vars) {
+    if (hlsl::IsHLSLNodeType(var->getAstType()))
+      continue;
     auto locCount = var->getLocationCount();
     uint32_t location = nextLocs(locCount);
     spvBuilder.decorateLocation(var->getSpirvInstr(), location);
@@ -3489,7 +3498,9 @@ SpirvVariable *DeclResultIdMapper::createSpirvInterfaceVariable(
   StageVar stageVar(
       stageVarData.sigPoint, *stageVarData.semantic, builtinAttr, evalType,
       // For HS/DS/GS, we have already stripped the outmost arrayness on type.
-      getLocationAndComponentCount(astContext, stageVarData.type));
+      hlsl::IsHLSLNodeInputType(stageVarData.type)
+          ? LocationAndComponent({0, 0, false})
+          : getLocationAndComponentCount(astContext, stageVarData.type));
   const auto name =
       stageVarData.namePrefix.str() + "." + stageVar.getSemanticStr();
   SpirvVariable *varInstr = createSpirvStageVar(
@@ -3708,6 +3719,22 @@ bool DeclResultIdMapper::createStageVars(StageVarDataBundle &stageVarData,
     stageVarData.semantic = &thisSemantic;
   }
 
+  if (hlsl::IsHLSLNodeType(stageVarData.type)) {
+    // Hijack the notion of semantic to use createSpirvInterfaceVariable
+    StringRef str = stageVarData.decl->getName();
+    stageVarData.semantic->str = stageVarData.semantic->name = str;
+    stageVarData.semantic->semantic = hlsl::Semantic::GetArbitrary();
+    SpirvVariable *varInstr = createSpirvInterfaceVariable(stageVarData);
+    if (!varInstr) {
+      return false;
+    }
+
+    *value = hlsl::IsHLSLNodeInputType(stageVarData.type)
+                 ? varInstr
+                 : loadShaderInputVariable(varInstr, stageVarData);
+    return true;
+  }
+
   if (stageVarData.semantic->isValid() &&
       // Structs with attached semantics will be handled later.
       !stageVarData.type->isStructureType()) {
@@ -4161,6 +4188,8 @@ SpirvVariable *DeclResultIdMapper::getBuiltinVar(spv::BuiltIn builtIn,
   case spv::BuiltIn::GlobalInvocationId:
   case spv::BuiltIn::WorkgroupId:
   case spv::BuiltIn::LocalInvocationIndex:
+  case spv::BuiltIn::RemainingRecursionLevelsAMDX:
+  case spv::BuiltIn::ShaderIndexAMDX:
     sc = spv::StorageClass::Input;
     break;
   case spv::BuiltIn::TaskCountNV:
@@ -4196,7 +4225,9 @@ SpirvVariable *DeclResultIdMapper::createSpirvStageVar(
   const auto type = stageVar->getAstType();
   const auto isPrecise = decl->hasAttr<HLSLPreciseAttr>();
   auto isNointerp = decl->hasAttr<HLSLNoInterpolationAttr>();
-  spv::StorageClass sc = getStorageClassForSigPoint(sigPoint);
+  spv::StorageClass sc = hlsl::IsHLSLNodeInputType(stageVar->getAstType())
+                             ? spv::StorageClass::NodePayloadAMDX
+                             : getStorageClassForSigPoint(sigPoint);
   if (sc == spv::StorageClass::Max)
     return 0;
   stageVar->setStorageClass(sc);
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp
index 7d39b0ec1f..8de0262ae6 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.cpp
+++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp
@@ -938,6 +938,73 @@ bool EmitVisitor::visit(SpirvBarrier *inst) {
   curInst.push_back(memoryScopeId);
   curInst.push_back(memorySemanticsId);
   finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
+bool EmitVisitor::visit(SpirvIsNodePayloadValid *inst) {
+  initInstruction(inst); // next: type id, result id, payload array, node index
+  curInst.push_back(inst->getResultTypeId());
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
+  curInst.push_back(
+      getOrAssignResultId<SpirvInstruction>(inst->getPayloadArray()));
+  curInst.push_back(
+      getOrAssignResultId<SpirvInstruction>(inst->getNodeIndex()));
+  finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
+bool EmitVisitor::visit(SpirvNodePayloadArrayLength *inst) {
+  initInstruction(inst); // next: type id, result id, payload array
+  curInst.push_back(inst->getResultTypeId());
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
+  curInst.push_back(
+      getOrAssignResultId<SpirvInstruction>(inst->getPayloadArray()));
+  finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
+bool EmitVisitor::visit(SpirvAllocateNodePayloads *inst) {
+  const uint32_t allocationScopeId = typeHandler.getOrCreateConstantInt(
+      llvm::APInt(32, static_cast<uint32_t>(inst->getAllocationScope())),
+      context.getUIntType(32), /*isSpecConst */ false);
+  // Pushes: result type, result id, scope constant, record count, shader index.
+  initInstruction(inst);
+  curInst.push_back(inst->getResultTypeId());
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
+  curInst.push_back(allocationScopeId);
+  curInst.push_back(
+      getOrAssignResultId<SpirvInstruction>(inst->getRecordCount()));
+  curInst.push_back(
+      getOrAssignResultId<SpirvInstruction>(inst->getShaderIndex()));
+  finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
+bool EmitVisitor::visit(SpirvEnqueueNodePayloads *inst) {
+  initInstruction(inst); // next: payload id only (no result type/id word)
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst->getPayload()));
+  finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
+bool EmitVisitor::visit(SpirvFinishWritingNodePayload *inst) {
+  initInstruction(inst); // next: type id, result id, payload
+  curInst.push_back(inst->getResultTypeId());
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
+  curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst->getPayload()));
+  finalizeInstruction(&mainBinary);
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
   return true;
 }
 
@@ -1011,6 +1078,13 @@ bool EmitVisitor::visit(SpirvConstantComposite *inst) {
   return true;
 }
 
+bool EmitVisitor::visit(SpirvConstantString *inst) {
+  typeHandler.getOrCreateConstant(inst); // delegated to the type handler
+  emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
+                              inst->getDebugName());
+  return true;
+}
+
 bool EmitVisitor::visit(SpirvConstantNull *inst) {
   typeHandler.getOrCreateConstant(inst);
   emitDebugNameForInstruction(getOrAssignResultId<SpirvInstruction>(inst),
@@ -2078,6 +2152,8 @@ uint32_t EmitTypeHandler::getOrCreateConstant(SpirvConstant *inst) {
     return getOrCreateConstantNull(constNull);
   } else if (auto *constBool = dyn_cast<SpirvConstantBoolean>(inst)) {
     return getOrCreateConstantBool(constBool);
+  } else if (auto *constString = dyn_cast<SpirvConstantString>(inst)) {
+    return getOrCreateConstantString(constString);
   } else if (auto *constUndef = dyn_cast<SpirvUndef>(inst)) {
     return getOrCreateUndef(constUndef);
   }
@@ -2116,6 +2192,36 @@ uint32_t EmitTypeHandler::getOrCreateConstantBool(SpirvConstantBoolean *inst) {
   return inst->getResultId();
 }
 
+uint32_t EmitTypeHandler::getOrCreateConstantString(SpirvConstantString *inst) {
+  // Emits each string constant once; repeats reuse the recorded result-id.
+  const StringRef text = inst->getString();
+  const bool specConst = inst->isSpecConstant();
+  if (specConst) {
+    // Spec constants are tracked per instruction, never merged by value.
+    if (emittedSpecConstantInstructions.find(inst) !=
+        emittedSpecConstantInstructions.end())
+      return inst->getResultId();
+  } else {
+    const auto known = emittedConstantStrings.find(text);
+    if (known != emittedConstantStrings.end()) {
+      // Same string was emitted before: alias inst to that result-id.
+      inst->setResultId(known->second->getResultId());
+      return inst->getResultId();
+    }
+  }
+  // Not emitted yet: write the instruction with the encoded string words.
+  const auto &encoded = string::encodeSPIRVString(inst->getString());
+  initTypeInstruction(inst->getopcode());
+  curTypeInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
+  curTypeInst.insert(curTypeInst.end(), encoded.begin(), encoded.end());
+  finalizeTypeInstruction();
+  if (specConst)
+    emittedSpecConstantInstructions.insert(inst);
+  else
+    emittedConstantStrings[text] = inst;
+  return inst->getResultId();
+}
+
 uint32_t EmitTypeHandler::getOrCreateConstantNull(SpirvConstantNull *inst) {
   auto found =
       std::find_if(emittedConstantNulls.begin(), emittedConstantNulls.end(),
@@ -2536,6 +2642,84 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) {
     if (stride.hasValue())
       emitDecoration(id, spv::Decoration::ArrayStride, {stride.getValue()});
   }
+  // NodePayloadArray types
+  else if (const auto *npaType = dyn_cast<NodePayloadArrayType>(type)) {
+    const uint32_t elemTypeId = emitType(npaType->getElementType());
+    initTypeInstruction(spv::Op::OpTypeNodePayloadArrayAMDX);
+    curTypeInst.push_back(id);
+    curTypeInst.push_back(elemTypeId);
+    finalizeTypeInstruction();
+
+    // Emit decorations
+    const ParmVarDecl *nodeDecl = npaType->getNodeDecl();
+    if (hlsl::IsHLSLNodeOutputType(nodeDecl->getType())) {
+      StringRef name = nodeDecl->getName();
+      unsigned index = 0;
+      if (auto nodeID = nodeDecl->getAttr<HLSLNodeIdAttr>()) {
+        name = nodeID->getName();
+        index = nodeID->getArrayIndex();
+      }
+
+      auto *str = new (context) SpirvConstantString(name);
+      uint32_t nodeName = getOrCreateConstantString(str);
+      emitDecoration(id, spv::Decoration::PayloadNodeNameAMDX, {nodeName},
+                     llvm::None, true);
+      if (index) {
+        uint32_t baseIndex = getOrCreateConstantInt(
+            llvm::APInt(32, index), context.getUIntType(32), false);
+        emitDecoration(id, spv::Decoration::PayloadNodeBaseIndexAMDX,
+                       {baseIndex}, llvm::None, true);
+      }
+    }
+
+    uint32_t maxRecords;
+    if (const auto *attr = nodeDecl->getAttr<HLSLMaxRecordsAttr>()) {
+      maxRecords = getOrCreateConstantInt(llvm::APInt(32, attr->getMaxCount()),
+                                          context.getUIntType(32), false);
+    } else {
+      maxRecords = getOrCreateConstantInt(llvm::APInt(32, 1),
+                                          context.getUIntType(32), false);
+    }
+    emitDecoration(id, spv::Decoration::NodeMaxPayloadsAMDX, {maxRecords},
+                   llvm::None, true);
+
+    if (const auto *attr = nodeDecl->getAttr<HLSLMaxRecordsSharedWithAttr>()) {
+      const DeclContext *dc = nodeDecl->getParentFunctionOrMethod();
+      if (const auto *funDecl = dyn_cast_or_null<FunctionDecl>(dc)) {
+        IdentifierInfo *ii = attr->getName();
+        bool alreadyExists = false;
+        for (auto *paramDecl : funDecl->params()) {
+          if (paramDecl->getIdentifier() == ii) {
+            assert(paramDecl != nodeDecl);
+            auto otherType = context.getNodeDeclPayloadType(paramDecl);
+            const uint32_t otherId =
+                getResultIdForType(otherType, &alreadyExists);
+            assert(alreadyExists && "forward references not allowed in "
+                                    "MaxRecordsSharedWith attribute");
+            emitDecoration(id, spv::Decoration::NodeSharesPayloadLimitsWithAMDX,
+                           {otherId}, llvm::None, true);
+            break;
+          }
+        }
+        assert(alreadyExists &&
+               "invalid reference in MaxRecordsSharedWith attribute");
+      }
+    }
+    if (const auto *attr = nodeDecl->getAttr<HLSLAllowSparseNodesAttr>()) {
+      emitDecoration(id, spv::Decoration::PayloadNodeSparseArrayAMDX, {},
+                     llvm::None);
+    }
+    if (const auto *attr = nodeDecl->getAttr<HLSLUnboundedSparseNodesAttr>()) {
+      emitDecoration(id, spv::Decoration::PayloadNodeSparseArrayAMDX, {},
+                     llvm::None);
+    }
+    if (const auto *attr = nodeDecl->getAttr<HLSLNodeArraySizeAttr>()) {
+      uint32_t arraySize = getOrCreateConstantInt(
+          llvm::APInt(32, attr->getCount()), context.getUIntType(32), false);
+      emitDecoration(id, spv::Decoration::PayloadNodeArraySizeAMDX, {arraySize},
+                     llvm::None, true);
+    }
+  }
   // Structure types
   else if (const auto *structType = dyn_cast<StructType>(type)) {
     std::vector<std::reference_wrapper<const StructType::FieldInfo>>
@@ -2549,6 +2733,15 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) {
       }
     }
 
+    if (const auto recordDecl = dyn_cast_or_null<RecordDecl>(
+            context.getStructDeclForSpirvType(structType))) {
+      auto index = context.getDispatchGridIndex(recordDecl);
+      if (index.hasValue()) {
+        emitDecoration(id, spv::Decoration::PayloadDispatchIndirectAMDX, {},
+                       index);
+      }
+    }
+
     // Emit OpMemberName for the struct members.
     for (size_t i = 0; i < fieldsToGenerate.size(); ++i)
       emitNameForType(fieldsToGenerate[i].get().name, id, i);
@@ -2611,6 +2804,13 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) {
     else if (interfaceType == StructInterfaceType::UniformBuffer)
       emitDecoration(id, spv::Decoration::Block, {});
 
+    // Emit NodeTrackRWInputSharing decoration if attribute is present.
+    const auto *structDecl = dyn_cast_or_null<RecordDecl>(
+        context.getStructDeclForSpirvType(structType));
+    if (structDecl && structDecl->hasAttr<HLSLNodeTrackRWInputSharingAttr>()) {
+      emitDecoration(id, spv::Decoration::TrackFinishWritingAMDX, {});
+    }
+
     initTypeInstruction(spv::Op::OpTypeStruct);
     curTypeInst.push_back(id);
     for (auto fieldTypeId : fieldTypeIds)
@@ -2753,14 +2953,17 @@ void EmitTypeHandler::emitLiteral(const SpirvConstant *literal,
 void EmitTypeHandler::emitDecoration(uint32_t typeResultId,
                                      spv::Decoration decoration,
                                      llvm::ArrayRef<uint32_t> decorationParams,
-                                     llvm::Optional<uint32_t> memberIndex) {
-
+                                     llvm::Optional<uint32_t> memberIndex,
+                                     bool usesIdParams) {
   spv::Op op =
       memberIndex.hasValue() ? spv::Op::OpMemberDecorate : spv::Op::OpDecorate;
   if (decoration == spv::Decoration::UserTypeGOOGLE) {
     op = memberIndex.hasValue() ? spv::Op::OpMemberDecorateString
                                 : spv::Op::OpDecorateString;
   }
+  if (usesIdParams) {
+    op = spv::Op::OpDecorateId;
+  }
 
   assert(curDecorationInst.empty());
   curDecorationInst.push_back(static_cast<uint32_t>(op));
diff --git a/tools/clang/lib/SPIRV/EmitVisitor.h b/tools/clang/lib/SPIRV/EmitVisitor.h
index 1cec230e50..fb4b22e52b 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.h
+++ b/tools/clang/lib/SPIRV/EmitVisitor.h
@@ -83,7 +83,8 @@ class EmitTypeHandler {
   // parameters.
   void emitDecoration(uint32_t typeResultId, spv::Decoration,
                       llvm::ArrayRef<uint32_t> decorationParams,
-                      llvm::Optional<uint32_t> memberIndex = llvm::None);
+                      llvm::Optional<uint32_t> memberIndex = llvm::None,
+                      bool usesIdParams = false);
 
   uint32_t getOrCreateConstant(SpirvConstant *);
 
@@ -110,6 +111,7 @@ class EmitTypeHandler {
   uint32_t getOrCreateConstantNull(SpirvConstantNull *);
   uint32_t getOrCreateUndef(SpirvUndef *);
   uint32_t getOrCreateConstantBool(SpirvConstantBoolean *);
+  uint32_t getOrCreateConstantString(SpirvConstantString *);
   template <typename vecType>
   void emitLiteral(const SpirvConstant *, vecType &outInst);
   template <typename vecType>
@@ -173,6 +175,7 @@ class EmitTypeHandler {
       emittedConstantInts;
   llvm::DenseMap<std::pair<uint64_t, const SpirvType *>, uint32_t>
       emittedConstantFloats;
+  llvm::DenseMap<StringRef, const SpirvConstantString *> emittedConstantStrings;
   llvm::SmallVector<SpirvConstantComposite *, 8> emittedConstantComposites;
   llvm::SmallVector<SpirvConstantNull *, 8> emittedConstantNulls;
   llvm::SmallVector<SpirvUndef *, 8> emittedUndef;
@@ -248,6 +251,11 @@ class EmitVisitor : public Visitor {
   bool visit(SpirvAccessChain *) override;
   bool visit(SpirvAtomic *) override;
   bool visit(SpirvBarrier *) override;
+  bool visit(SpirvIsNodePayloadValid *inst) override;
+  bool visit(SpirvNodePayloadArrayLength *inst) override;
+  bool visit(SpirvAllocateNodePayloads *inst) override;
+  bool visit(SpirvEnqueueNodePayloads *inst) override;
+  bool visit(SpirvFinishWritingNodePayload *inst) override;
   bool visit(SpirvBinaryOp *) override;
   bool visit(SpirvBitFieldExtract *) override;
   bool visit(SpirvBitFieldInsert *) override;
@@ -255,6 +263,7 @@ class EmitVisitor : public Visitor {
   bool visit(SpirvConstantInteger *) override;
   bool visit(SpirvConstantFloat *) override;
   bool visit(SpirvConstantComposite *) override;
+  bool visit(SpirvConstantString *) override;
   bool visit(SpirvConstantNull *) override;
   bool visit(SpirvConvertPtrToU *) override;
   bool visit(SpirvConvertUToPtr *) override;
@@ -455,6 +464,10 @@ class EmitVisitor : public Visitor {
   std::vector<uint32_t> mainBinary;
   // String literals to SpirvString objects
   llvm::StringMap<uint32_t> stringIdMap;
+  // String literals to SpirvConstantString objects
+  llvm::StringMap<uint32_t> stringConstantIdMap;
+  // String spec constants
+  llvm::DenseSet<const SpirvInstruction *> stringSpecConstantInstructions;
   // Main file information for debugging that will be used by OpLine.
   uint32_t debugMainFileId;
   // Id for Vulkan DebugInfo extended instruction set. Used when generating
diff --git a/tools/clang/lib/SPIRV/FeatureManager.cpp b/tools/clang/lib/SPIRV/FeatureManager.cpp
index 7fb449fee9..b6aed4d8b6 100644
--- a/tools/clang/lib/SPIRV/FeatureManager.cpp
+++ b/tools/clang/lib/SPIRV/FeatureManager.cpp
@@ -214,6 +214,7 @@ Extension FeatureManager::getExtensionSymbol(llvm::StringRef name) {
       .Case("SPV_EXT_shader_image_int64", Extension::EXT_shader_image_int64)
       .Case("SPV_KHR_physical_storage_buffer",
             Extension::KHR_physical_storage_buffer)
+      .Case("SPV_AMDX_shader_enqueue", Extension::AMD_shader_enqueue)
       .Case("SPV_KHR_vulkan_memory_model", Extension::KHR_vulkan_memory_model)
       .Case("SPV_KHR_compute_shader_derivatives",
             Extension::KHR_compute_shader_derivatives)
@@ -284,6 +285,8 @@ const char *FeatureManager::getExtensionName(Extension symbol) {
     return "SPV_EXT_shader_image_int64";
   case Extension::KHR_physical_storage_buffer:
     return "SPV_KHR_physical_storage_buffer";
+  case Extension::AMD_shader_enqueue:
+    return "SPV_AMDX_shader_enqueue";
   case Extension::KHR_vulkan_memory_model:
     return "SPV_KHR_vulkan_memory_model";
   case Extension::KHR_compute_shader_derivatives:
diff --git a/tools/clang/lib/SPIRV/GlPerVertex.cpp b/tools/clang/lib/SPIRV/GlPerVertex.cpp
index 09b09236b4..aa5a40d008 100644
--- a/tools/clang/lib/SPIRV/GlPerVertex.cpp
+++ b/tools/clang/lib/SPIRV/GlPerVertex.cpp
@@ -324,6 +324,9 @@ bool GlPerVertex::setClipCullDistanceType(SemanticIndexToTypeMap *typeMap,
 
 bool GlPerVertex::doGlPerVertexFacts(const NamedDecl *decl, QualType baseType,
                                      bool asInput) {
+  if (hlsl::IsHLSLNodeType(baseType)) {
+    return true;
+  }
 
   llvm::StringRef semanticStr;
   const hlsl::Semantic *semantic = {};
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 8238750af9..1869983ae3 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -362,6 +362,16 @@ const SpirvType *LowerTypeVisitor::lowerType(const SpirvType *type,
       return raType;
     return spvContext.getRuntimeArrayType(loweredElemType, raType->getStride());
   }
+  // Node payload arrays could contain a hybrid type
+  else if (const auto *npaType = dyn_cast<NodePayloadArrayType>(type)) {
+    const auto *loweredElemType =
+        lowerType(npaType->getElementType(), rule, loc);
+    // If the payload array contained no hybrid types, return it as is.
+    if (npaType->getElementType() == loweredElemType)
+      return npaType;
+    return spvContext.getNodePayloadArrayType(loweredElemType,
+                                              npaType->getNodeDecl());
+  }
   // Pointer types could point to a hybrid type.
   else if (const auto *ptrType = dyn_cast<SpirvPointerType>(type)) {
     const auto *loweredPointee =
diff --git a/tools/clang/lib/SPIRV/PreciseVisitor.cpp b/tools/clang/lib/SPIRV/PreciseVisitor.cpp
index 34e6087990..f1869318a4 100644
--- a/tools/clang/lib/SPIRV/PreciseVisitor.cpp
+++ b/tools/clang/lib/SPIRV/PreciseVisitor.cpp
@@ -60,6 +60,9 @@ bool isAccessingPrecise(clang::spirv::SpirvAccessChain *inst) {
     } else if (auto *raType = llvm::dyn_cast<RuntimeArrayType>(baseType)) {
       indexes.pop();
       baseType = raType->getElementType();
+    } else if (auto *npaType = llvm::dyn_cast<NodePayloadArrayType>(baseType)) {
+      indexes.pop();
+      baseType = npaType->getElementType();
     } else if (auto *structType = llvm::dyn_cast<StructType>(baseType)) {
       SpirvInstruction *index = indexes.top();
       if (auto *constInt = llvm::dyn_cast<SpirvConstantInteger>(index)) {
diff --git a/tools/clang/lib/SPIRV/SpirvBuilder.cpp b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
index d776ba65fb..e085603b21 100644
--- a/tools/clang/lib/SPIRV/SpirvBuilder.cpp
+++ b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
@@ -81,7 +81,9 @@ SpirvBuilder::addFnParam(QualType ptrType, bool isPrecise, bool isNointerp,
     param = new (context)
         SpirvFunctionParameter(ptrType, isPrecise, isNointerp, loc);
   }
-  param->setStorageClass(spv::StorageClass::Function);
+  param->setStorageClass(hlsl::IsHLSLNodeInputType(ptrType)
+                             ? spv::StorageClass::NodePayloadAMDX
+                             : spv::StorageClass::Function);
   param->setDebugName(name);
   function->addParameter(param);
   return param;
@@ -230,6 +232,13 @@ SpirvInstruction *SpirvBuilder::createLoad(QualType resultType,
     createEndInvocationInterlockEXT(loc, range);
   }
 
+  if (context.hasLoweredType(pointer)) {
+    // preserve distinct node payload array types
+    auto *ptrType = dyn_cast<SpirvPointerType>(pointer->getResultType());
+    instruction->setResultType(ptrType->getPointeeType());
+    context.addToInstructionsWithLoweredType(instruction);
+  }
+
   const auto &bitfieldInfo = pointer->getBitfieldInfo();
   if (!bitfieldInfo.hasValue())
     return instruction;
@@ -306,6 +315,12 @@ SpirvStore *SpirvBuilder::createStore(SpirvInstruction *address,
 
   auto *instruction =
       new (context) SpirvStore(loc, address, source, llvm::None, range);
+  if (context.hasLoweredType(source)) {
+    // preserve distinct node payload array types
+    address->setResultType(context.getPointerType(source->getResultType(),
+                                                  address->getStorageClass()));
+    context.addToInstructionsWithLoweredType(address);
+  }
   insertPoint->addInstruction(instruction);
 
   if (address->getStorageClass() == spv::StorageClass::PhysicalStorageBuffer &&
@@ -872,6 +887,53 @@ SpirvInstruction *SpirvBuilder::createNonSemanticDebugPrintfExtInst(
   return extInst;
 }
 
+SpirvInstruction *
+SpirvBuilder::createIsNodePayloadValid(SpirvInstruction *payloadArray,
+                                       SpirvInstruction *nodeIndex,
+                                       SourceLocation loc) {
+  auto *inst = new (context)
+      SpirvIsNodePayloadValid(astContext.BoolTy, loc, payloadArray, nodeIndex);
+  insertPoint->addInstruction(inst); // NOTE(review): insertPoint not asserted
+  return inst; // result type is astContext.BoolTy
+}
+
+SpirvInstruction *
+SpirvBuilder::createNodePayloadArrayLength(SpirvInstruction *payloadArray,
+                                           SourceLocation loc) {
+  auto *inst = new (context)
+      SpirvNodePayloadArrayLength(astContext.UnsignedIntTy, loc, payloadArray);
+  insertPoint->addInstruction(inst); // NOTE(review): insertPoint not asserted
+  return inst; // result type is astContext.UnsignedIntTy
+}
+
+SpirvInstruction *SpirvBuilder::createAllocateNodePayloads(
+    QualType resultType, spv::Scope allocationScope,
+    SpirvInstruction *shaderIndex, SpirvInstruction *recordCount,
+    SourceLocation loc) {
+  assert(insertPoint && "null insert point");
+  auto *inst = new (context) SpirvAllocateNodePayloads(
+      resultType, loc, allocationScope, shaderIndex, recordCount);
+  insertPoint->addInstruction(inst); // added at the current insert point
+  return inst; // typed with the caller-supplied resultType
+}
+
+void SpirvBuilder::createEnqueueOutputNodePayloads(SpirvInstruction *payload,
+                                                   SourceLocation loc) {
+  assert(insertPoint && "null insert point");
+  auto *inst = new (context) SpirvEnqueueNodePayloads(loc, payload);
+  insertPoint->addInstruction(inst); // added at the current insert point
+}
+
+SpirvInstruction *
+SpirvBuilder::createFinishWritingNodePayload(SpirvInstruction *payload,
+                                             SourceLocation loc) {
+  assert(insertPoint && "null insert point");
+  auto *inst = new (context)
+      SpirvFinishWritingNodePayload(astContext.BoolTy, loc, payload);
+  insertPoint->addInstruction(inst); // added at the current insert point
+  return inst; // result type is astContext.BoolTy
+}
+
 void SpirvBuilder::createBarrier(spv::Scope memoryScope,
                                  spv::MemorySemanticsMask memorySemantics,
                                  llvm::Optional<spv::Scope> exec,
@@ -1866,6 +1928,14 @@ SpirvConstant *SpirvBuilder::getConstantNull(QualType type) {
   return nullConst;
 }
 
+SpirvConstant *SpirvBuilder::getConstantString(llvm::StringRef str,
+                                               bool specConst) {
+  // No uniquing here: each call allocates a constant and adds it to the module.
+  auto *constant = new (context) SpirvConstantString(str, specConst);
+  mod->addConstant(constant);
+  return constant;
+}
+
 SpirvUndef *SpirvBuilder::getUndef(QualType type) {
   // We do not care about making unique constants at this point.
   auto *undef = new (context) SpirvUndef(type);
diff --git a/tools/clang/lib/SPIRV/SpirvContext.cpp b/tools/clang/lib/SPIRV/SpirvContext.cpp
index cb44d3a3a8..88716dddde 100644
--- a/tools/clang/lib/SPIRV/SpirvContext.cpp
+++ b/tools/clang/lib/SPIRV/SpirvContext.cpp
@@ -62,6 +62,9 @@ SpirvContext::~SpirvContext() {
   for (auto *raType : runtimeArrayTypes)
     raType->~RuntimeArrayType();
 
+  for (auto *npaType : nodePayloadArrayTypes)
+    npaType->~NodePayloadArrayType();
+
   for (auto *fnType : functionTypes)
     fnType->~FunctionType();
 
@@ -273,6 +276,19 @@ SpirvContext::getRuntimeArrayType(const SpirvType *elemType,
   return *(inserted.first);
 }
 
+const NodePayloadArrayType *
+SpirvContext::getNodePayloadArrayType(const SpirvType *elemType,
+                                      const ParmVarDecl *nodeDecl) {
+  // Probe with a stack temporary so nothing is allocated on a cache hit.
+  NodePayloadArrayType probe(elemType, nodeDecl);
+  const auto existing = nodePayloadArrayTypes.find(&probe);
+  if (existing != nodePayloadArrayTypes.end())
+    return *existing;
+  // No matching type is cached yet: allocate one in this context and keep it.
+  auto *fresh = new (this) NodePayloadArrayType(elemType, nodeDecl);
+  return *nodePayloadArrayTypes.insert(fresh).first;
+}
+
 const StructType *
 SpirvContext::getStructType(llvm::ArrayRef<StructType::FieldInfo> fields,
                             llvm::StringRef name, bool isReadOnly,
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 24774875f7..f3d10537e1 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -265,7 +265,8 @@ inline bool canActAsInParmVar(const ParmVarDecl *param) {
   return !param->hasAttr<HLSLOutAttr>() &&
          // GS output streams are marked as inout, but it should not be
          // used as in parameter.
-         !hlsl::IsHLSLStreamOutputType(param->getType());
+         !hlsl::IsHLSLStreamOutputType(param->getType()) &&
+         !hlsl::IsHLSLNodeOutputType(param->getType());
 }
 
 /// Returns true if the given function parameter can act as shader stage
@@ -1424,6 +1425,83 @@ SpirvInstruction *SpirvEmitter::castToType(SpirvInstruction *value,
   return nullptr;
 }
 
+// Searches recordDecl's fields, recursing into record-typed fields, for the
+// SV_DispatchGrid semantic; on a hit registers its field index, returns true.
+static bool handleDispatchGrid(SpirvContext &spvContext,
+                               const RecordDecl *recordDecl) {
+  unsigned fieldIndex = 0;
+  for (auto field : recordDecl->fields()) {
+    for (const hlsl::UnusualAnnotation *annotation :
+         field->getUnusualAnnotations()) {
+      if (annotation->getKind() != hlsl::UnusualAnnotation::UA_SemanticDecl)
+        continue;
+      const auto *semantic = cast<hlsl::SemanticDecl>(annotation);
+      if (semantic->SemanticName.equals("SV_DispatchGrid")) {
+        spvContext.registerDispatchGridIndex(recordDecl, fieldIndex);
+        return true;
+      }
+    }
+    const auto *nested = field->getType()->getAs<RecordType>();
+    if (nested && handleDispatchGrid(spvContext, nested->getDecl()))
+      return true;
+    ++fieldIndex;
+  }
+  return false;
+}
+
+bool SpirvEmitter::handleNodePayloadArrayType(const ParmVarDecl *decl,
+                                              SpirvInstruction *instr) {
+  // Node payload array types are node-specific in SPIR-V, so attach decl's
+  // lowered payload type to instr. Returns true iff a result type was set.
+  switch (instr->getKind()) {
+  case SpirvInstruction::Kind::IK_Load: {
+    SpirvInstruction *ptr = cast<SpirvLoad>(instr)->getPointer();
+    if (!handleNodePayloadArrayType(decl, ptr))
+      return false;
+    const auto *ptrType = dyn_cast<SpirvPointerType>(ptr->getResultType());
+    if (!ptrType) // nothing to unwrap, e.g. ptr is itself a load
+      return false;
+    instr->setResultType(ptrType->getPointeeType());
+    spvContext.addToInstructionsWithLoweredType(instr);
+    return true;
+  }
+  case SpirvInstruction::Kind::IK_FunctionParameter:
+  case SpirvInstruction::Kind::IK_Variable: {
+    QualType varType = decl->getType();
+    if (hlsl::IsHLSLNodeType(varType)) {
+      if (auto *type = spvContext.getNodeDeclPayloadType(decl)) { // cached
+        instr->setResultType(
+            spvContext.getPointerType(type, instr->getStorageClass()));
+      } else { // first use: build and register the payload array type
+        LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
+                                          spvBuilder);
+        QualType resultType =
+            hlsl::GetHLSLNodeIOResultType(astContext, varType);
+        const auto *recordType = resultType->getAs<RecordType>();
+        assert(recordType && "node I/O result type is not a record");
+        if (hlsl::IsHLSLDispatchNodeInputRecordType(varType)) {
+          handleDispatchGrid(spvContext, recordType->getDecl());
+        }
+        const SpirvType *elemType = lowerTypeVisitor.lowerType(
+            resultType, clang::spirv::SpirvLayoutRule::Scalar, llvm::None,
+            decl->getLocation());
+        const NodePayloadArrayType *arrType =
+            spvContext.getNodePayloadArrayType(elemType, decl);
+        const SpirvType *ptrType =
+            spvContext.getPointerType(arrType, instr->getStorageClass());
+        instr->setResultType(ptrType);
+        spvContext.registerNodeDeclPayloadType(arrType, decl);
+      }
+      spvContext.addToInstructionsWithLoweredType(instr);
+      return true;
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
 void SpirvEmitter::doFunctionDecl(const FunctionDecl *decl) {
   // Forward declaration of a function inside another.
   if (!decl->isThisDeclarationADefinition()) {
@@ -1565,6 +1643,9 @@ void SpirvEmitter::doFunctionDecl(const FunctionDecl *decl) {
     QualType paramType = paramDecl->getType();
     auto *param =
         declIdMapper.createFnParam(paramDecl, i + 1 + isNonStaticMemberFn);
+    if (isEntry) {
+      handleNodePayloadArrayType(paramDecl, param);
+    }
 #ifdef ENABLE_SPIRV_CODEGEN
     if (hlsl::IsVKBufferPointerType(paramType)) {
       Optional<bool> isRowMajor = llvm::None;
@@ -4318,7 +4399,7 @@ SpirvEmitter::processTextureLevelOfDetail(const CXXMemberCallExpr *expr,
       spvBuilder.createImageQuery(spv::Op::OpImageQueryLod, queryResultType,
                                   expr->getExprLoc(), sampledImage, coordinate);
 
-  if (spvContext.isCS()) {
+  if (spvContext.isCS() || spvContext.isNode()) {
     addDerivativeGroupExecutionMode();
   }
   // The first component of the float2 contains the mipmap array layer.
@@ -5321,6 +5402,9 @@ SpirvEmitter::doCXXMemberCallExpr(const CXXMemberCallExpr *expr) {
   uint32_t opcode = static_cast<uint32_t>(hlsl::IntrinsicOp::Num_Intrinsics);
 
   if (hlsl::GetIntrinsicOp(callee, opcode, group)) {
+    if (group == "subscript") {
+      return processIntrinsicExtractRecordStruct(expr);
+    }
     return processIntrinsicMemberCall(expr,
                                       static_cast<hlsl::IntrinsicOp>(opcode));
   }
@@ -5517,6 +5601,28 @@ SpirvEmitter::processIntrinsicMemberCall(const CXXMemberCallExpr *expr,
     return processRayQueryIntrinsics(expr, opcode);
   case IntrinsicOp::MOP_GetBufferContents:
     return processIntrinsicGetBufferContents(expr);
+  case hlsl::IntrinsicOp::MOP_GetThreadNodeOutputRecords:
+    return processIntrinsicGetNodeOutputRecords(expr, false);
+  case hlsl::IntrinsicOp::MOP_GetGroupNodeOutputRecords:
+    return processIntrinsicGetNodeOutputRecords(expr, true);
+  case hlsl::IntrinsicOp::MOP_ThreadIncrementOutputCount:
+    retVal = processIntrinsicIncrementOutputCount(expr, false);
+    break;
+  case hlsl::IntrinsicOp::MOP_GroupIncrementOutputCount:
+    retVal = processIntrinsicIncrementOutputCount(expr, true);
+    break;
+  case hlsl::IntrinsicOp::MOP_IsValid:
+    retVal = processIntrinsicIsValid(expr);
+    break;
+  case hlsl::IntrinsicOp::MOP_Count:
+    retVal = processIntrinsicGetRecordCount(expr);
+    break;
+  case hlsl::IntrinsicOp::MOP_OutputComplete:
+    processIntrinsicOutputComplete(expr);
+    break;
+  case hlsl::IntrinsicOp::MOP_FinishedCrossGroupSharing:
+    retVal = processIntrinsicFinishedCrossGroupSharing(expr);
+    break;
   default:
     emitError("intrinsic '%0' method unimplemented",
               expr->getCallee()->getExprLoc())
@@ -5568,7 +5674,8 @@ SpirvInstruction *SpirvEmitter::createImageSample(
   const bool isExplicit = lod || (grad.first && grad.second);
 
   // Implicit-lod instructions are only allowed in pixel and compute shaders.
-  if (!spvContext.isPS() && !spvContext.isCS() && !isExplicit)
+  if (!spvContext.isPS() && !spvContext.isCS() && !spvContext.isNode() &&
+      !isExplicit)
     emitError("sampling with implicit lod is only allowed in fragment and "
               "compute shaders",
               loc);
@@ -5673,7 +5780,7 @@ SpirvEmitter::processTextureSampleGather(const CXXMemberCallExpr *expr,
 
   const auto retType = expr->getDirectCallee()->getReturnType();
   if (isSample) {
-    if (spvContext.isCS()) {
+    if (spvContext.isCS() || spvContext.isNode()) {
       addDerivativeGroupExecutionMode();
     }
     return createImageSample(retType, imageType, image, sampler, coordinate,
@@ -5763,7 +5870,7 @@ SpirvEmitter::processTextureSampleBiasLevel(const CXXMemberCallExpr *expr,
 
   const auto retType = expr->getDirectCallee()->getReturnType();
 
-  if (!lod && spvContext.isCS()) {
+  if (!lod && (spvContext.isCS() || spvContext.isNode())) {
     addDerivativeGroupExecutionMode();
   }
   return createImageSample(
@@ -8647,9 +8754,10 @@ const Expr *SpirvEmitter::collectArrayStructIndices(
   }
 
   {
-    // Indexing into ConstantBuffers and TextureBuffers involves an additional
-    // FlatConversion node which casts the handle to the underlying structure
-    // type. We can look past the FlatConversion to continue to collect indices.
+    // Indexing into ConstantBuffers, TextureBuffers, and node input/output
+    // types involves an additional FlatConversion node which casts the handle
+    // to the underlying structure type. We can look past the FlatConversion to
+    // continue to collect indices.
     // For example: MyConstantBufferArray[0].structMember1
     // `-MemberExpr .structMember1
     //   `-ImplicitCastExpr 'const T' lvalue <FlatConversion>
@@ -8658,7 +8766,8 @@ const Expr *SpirvEmitter::collectArrayStructIndices(
       if (castExpr->getCastKind() == CK_FlatConversion) {
         const auto *subExpr = castExpr->getSubExpr();
         const QualType subExprType = subExpr->getType();
-        if (isConstantTextureBuffer(subExprType)) {
+        if (isConstantTextureBuffer(subExprType) ||
+            hlsl::IsHLSLNodeType(subExprType)) {
           return collectArrayStructIndices(subExpr, rawIndex, rawIndices,
                                            indices, isMSOutAttribute);
         }
@@ -9060,6 +9169,9 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
   case hlsl::IntrinsicOp::IOP_udot:
     retVal = processIntrinsicDot(callExpr);
     break;
+  case hlsl::IntrinsicOp::IOP_Barrier:
+    retVal = processIntrinsicBarrier(callExpr);
+    break;
   case hlsl::IntrinsicOp::IOP_GroupMemoryBarrier:
     retVal = processIntrinsicMemoryBarrier(callExpr,
                                            /*isDevice*/ false,
@@ -9092,6 +9204,9 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
                                            /*groupSync*/ true,
                                            /*isAllBarrier*/ true);
     break;
+  case hlsl::IntrinsicOp::IOP_GetRemainingRecursionLevels:
+    retVal = processIntrinsicGetRemainingRecursionLevels(callExpr);
+    break;
   case hlsl::IntrinsicOp::IOP_CheckAccessFullyMapped:
     retVal = spvBuilder.createImageSparseTexelsResident(
         doExpr(callExpr->getArg(0)), srcLoc, srcRange);
@@ -9568,6 +9683,15 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
   return retVal;
 }
 
+SpirvInstruction *SpirvEmitter::processIntrinsicGetRecordCount(
+    const CXXMemberCallExpr *callExpr) {
+  assert(callExpr->getNumArgs() == 0);
+  const auto obj = callExpr->getImplicitObjectArgument();
+  const auto loc = callExpr->getExprLoc();
+  SpirvInstruction *payload = doExpr(obj);
+  return spvBuilder.createNodePayloadArrayLength(payload, loc);
+}
+
 SpirvInstruction *
 SpirvEmitter::processIntrinsicFirstbit(const CallExpr *callExpr,
                                        GLSLstd450 glslOpcode) {
@@ -11049,6 +11173,194 @@ SpirvInstruction *SpirvEmitter::processIntrinsicGetBufferContents(
   return retVal;
 }
 
+SpirvInstruction *SpirvEmitter::processIntrinsicExtractRecordStruct(
+    const CXXMemberCallExpr *callExpr) {
+  Expr *obj = callExpr->getImplicitObjectArgument();
+  QualType objType = obj->getType();
+  unsigned n = callExpr->getNumArgs();
+  assert(hlsl::IsHLSLNodeType(objType));
+  assert(n == 0 || (n == 1 && hlsl::IsHLSLNodeRecordArrayType(objType)));
+  // No argument selects record 0; otherwise the argument is the index.
+  QualType recordType = hlsl::GetHLSLNodeIOResultType(astContext, objType);
+  SpirvInstruction *res = doExpr(obj);
+  SpirvInstruction *index =
+      n ? doExpr(callExpr->getArg(0))
+        : spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                    llvm::APInt(32, 0));
+  res->setLayoutRule(SpirvLayoutRule::Scalar);
+
+  return spvBuilder.createAccessChain(recordType, res, {index},
+                                      callExpr->getExprLoc(),
+                                      callExpr->getSourceRange());
+}
+
+SpirvInstruction *SpirvEmitter::processIntrinsicGetRemainingRecursionLevels(
+    const CallExpr *callExpr) {
+  assert(callExpr->getNumArgs() == 0);
+  const auto loc = callExpr->getExprLoc();
+  const QualType retType = callExpr->getCallReturnType(astContext);
+  auto *var = declIdMapper.getBuiltinVar(
+      spv::BuiltIn::RemainingRecursionLevelsAMDX, retType, loc);
+  return spvBuilder.createLoad(retType, var, loc);
+}
+
+SpirvInstruction *
+SpirvEmitter::processIntrinsicIsValid(const CXXMemberCallExpr *callExpr) {
+  assert(callExpr->getNumArgs() == 0);
+  const auto loc = callExpr->getExprLoc();
+  const Expr *baseExpr = callExpr->getImplicitObjectArgument();
+  SpirvInstruction *shaderIndex = nullptr;
+  // For `outputs[i].IsValid()` the subscript is the shader index and the
+  // subscripted object is the payload.
+  if (const auto *subExpr = dyn_cast<CXXOperatorCallExpr>(
+          baseExpr->IgnoreParenNoopCasts(astContext))) {
+    if (subExpr->getOperator() == OverloadedOperatorKind::OO_Subscript) {
+      shaderIndex = doExpr(subExpr->getArg(1));
+      baseExpr = subExpr->getArg(0);
+    }
+  }
+  // Report a diagnostic rather than dereferencing a failed dyn_cast.
+  const auto *declRef = dyn_cast<DeclRefExpr>(baseExpr->IgnoreImpCasts());
+  const auto *paramDecl =
+      declRef ? dyn_cast<ParmVarDecl>(declRef->getDecl()) : nullptr;
+  if (!paramDecl) {
+    emitError("node output object must be a function parameter", loc);
+    return nullptr;
+  }
+  // Without a subscript, use the NodeID attribute's array index (0 if absent).
+  const auto *nodeId = paramDecl->getAttr<HLSLNodeIdAttr>();
+  const int nodeIndex = nodeId ? nodeId->getArrayIndex() : 0;
+  SpirvInstruction *payload = doExpr(baseExpr);
+  if (!shaderIndex)
+    shaderIndex = spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                            llvm::APInt(32, nodeIndex));
+  return spvBuilder.createIsNodePayloadValid(payload, shaderIndex, loc);
+}
+
+SpirvInstruction *SpirvEmitter::processIntrinsicGetNodeOutputRecords(
+    const CXXMemberCallExpr *callExpr, bool isGroupShared) {
+  assert(callExpr->getNumArgs() == 1);
+  const auto loc = callExpr->getExprLoc();
+  const Expr *baseExpr = callExpr->getImplicitObjectArgument();
+  SpirvInstruction *shaderIndex = nullptr;
+  // For a call on `outputs[i]` the subscript is the shader index and the
+  // subscripted object identifies the node output.
+  if (const auto *subExpr = dyn_cast<CXXOperatorCallExpr>(
+          baseExpr->IgnoreParenNoopCasts(astContext))) {
+    if (subExpr->getOperator() == OverloadedOperatorKind::OO_Subscript) {
+      shaderIndex = doExpr(subExpr->getArg(1));
+      baseExpr = subExpr->getArg(0);
+    }
+  }
+  // Report a diagnostic rather than dereferencing a failed dyn_cast.
+  const auto *declRef = dyn_cast<DeclRefExpr>(baseExpr->IgnoreImpCasts());
+  const auto *paramDecl =
+      declRef ? dyn_cast<ParmVarDecl>(declRef->getDecl()) : nullptr;
+  if (!paramDecl) {
+    emitError("node output object must be a function parameter", loc);
+    return nullptr;
+  }
+
+  // Without a subscript, use the NodeID attribute's array index (0 if absent).
+  if (!shaderIndex) {
+    const auto *nodeID = paramDecl->getAttr<HLSLNodeIdAttr>();
+    const unsigned nodeIndex = nodeID ? nodeID->getArrayIndex() : 0;
+    shaderIndex = spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                            llvm::APInt(32, nodeIndex));
+  }
+
+  LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
+                                    spvBuilder);
+  const SpirvType *elemType = lowerTypeVisitor.lowerType(
+      hlsl::GetHLSLNodeIOResultType(astContext, baseExpr->getType()),
+      clang::spirv::SpirvLayoutRule::Scalar, llvm::None,
+      paramDecl->getLocation());
+  const SpirvType *payloadType = spvContext.getPointerType(
+      spvContext.getNodePayloadArrayType(elemType, paramDecl),
+      spv::StorageClass::NodePayloadAMDX);
+
+  spv::Scope scope =
+      isGroupShared ? spv::Scope::Workgroup : spv::Scope::Invocation;
+  SpirvInstruction *recordCount = doExpr(callExpr->getArg(0));
+  SpirvInstruction *result = spvBuilder.createAllocateNodePayloads(
+      callExpr->getType(), scope, shaderIndex, recordCount, loc);
+  result->setResultType(payloadType);
+  spvContext.addToInstructionsWithLoweredType(result);
+  return result;
+}
+
+SpirvInstruction *SpirvEmitter::processIntrinsicIncrementOutputCount(
+    const CXXMemberCallExpr *callExpr, bool isGroupShared) {
+  return processIntrinsicGetNodeOutputRecords(callExpr, isGroupShared);
+}
+
+void SpirvEmitter::processIntrinsicOutputComplete(
+    const CXXMemberCallExpr *callExpr) {
+  Expr *payloadExpr =
+      callExpr->getImplicitObjectArgument()->IgnoreParenNoopCasts(astContext);
+  SpirvInstruction *payload = doExpr(payloadExpr);
+  spvBuilder.createEnqueueOutputNodePayloads(payload, callExpr->getExprLoc());
+}
+
+SpirvInstruction *SpirvEmitter::processIntrinsicFinishedCrossGroupSharing(
+    const CXXMemberCallExpr *callExpr) {
+  Expr *payloadExpr = callExpr->getImplicitObjectArgument();
+  SpirvInstruction *payload = doExpr(payloadExpr);
+  return spvBuilder.createFinishWritingNodePayload(payload,
+                                                   callExpr->getExprLoc());
+}
+
+SpirvInstruction *
+SpirvEmitter::processIntrinsicBarrier(const CallExpr *callExpr) {
+  const Expr *memoryArg = callExpr->getArg(0);
+  const Expr *semanticArg = callExpr->getArg(1);
+  llvm::APSInt memoryBits(32, true), semanticBits(32, true);
+
+  // A first argument that is not an integer constant must be an object; all
+  // memory-type bits are set for it.
+  if (!memoryArg->EvaluateAsInt(memoryBits, astContext)) {
+    assert(memoryArg->getType()->isStructureOrClassType());
+    memoryBits.setAllBits();
+  }
+
+  if (!semanticArg->EvaluateAsInt(semanticBits, astContext) ||
+      memoryBits.getExtValue() < 0 || semanticBits.getExtValue() < 0) {
+    emitError("Barrier arguments must be non-negative integer constants",
+              callExpr->getExprLoc());
+    return nullptr;
+  }
+  const int64_t memFlags = memoryBits.getExtValue();
+  const int64_t semFlags = semanticBits.getExtValue();
+  if (memFlags == 0 && semFlags == 0) // nothing requested -> no-op
+    return nullptr;
+
+  // Memory scope: device wins over group; default is the invocation.
+  spv::Scope memScope = spv::Scope::Invocation;
+  if (semFlags & (unsigned)hlsl::DXIL::BarrierSemanticFlag::DeviceScope)
+    memScope = spv::Scope::Device;
+  else if (semFlags & (unsigned)hlsl::DXIL::BarrierSemanticFlag::GroupScope)
+    memScope = spv::Scope::Workgroup;
+
+  // Always acquire-release, plus one semantics bit per requested memory type.
+  spv::MemorySemanticsMask memSemaMask =
+      spv::MemorySemanticsMask::AcquireRelease;
+  if (memFlags & (unsigned)hlsl::DXIL::MemoryTypeFlag::UavMemory)
+    memSemaMask = memSemaMask | spv::MemorySemanticsMask::UniformMemory;
+  if (memFlags & (unsigned)hlsl::DXIL::MemoryTypeFlag::GroupSharedMemory)
+    memSemaMask = memSemaMask | spv::MemorySemanticsMask::WorkgroupMemory;
+  if (memFlags & (unsigned)hlsl::DXIL::MemoryTypeFlag::NodeOutputMemory)
+    memSemaMask = memSemaMask | spv::MemorySemanticsMask::OutputMemory;
+
+  // Only GroupSync adds an execution scope.
+  Optional<spv::Scope> execScope;
+  if (semFlags & (unsigned)hlsl::DXIL::BarrierSemanticFlag::GroupSync)
+    execScope = spv::Scope::Workgroup;
+
+  spvBuilder.createBarrier(memScope, memSemaMask, execScope,
+                           callExpr->getExprLoc());
+  return nullptr;
+}
+
 SpirvInstruction *
 SpirvEmitter::processIntrinsicMemoryBarrier(const CallExpr *callExpr,
                                             bool isDevice, bool groupSync,
@@ -12186,7 +12498,7 @@ SpirvInstruction *SpirvEmitter::processIntrinsicUsingSpirvInst(
     case spv::Op::OpFwidth:
     case spv::Op::OpFwidthFine:
     case spv::Op::OpFwidthCoarse:
-      if (spvContext.isCS())
+      if (spvContext.isCS() || spvContext.isNode())
         addDerivativeGroupExecutionMode();
       needsLegalization = true;
       break;
@@ -13272,6 +13584,7 @@ hlsl::ShaderModel::Kind SpirvEmitter::getShaderModelKind(StringRef stageName) {
           .Case("callable", hlsl::ShaderModel::Kind::Callable)
           .Case("mesh", hlsl::ShaderModel::Kind::Mesh)
           .Case("amplification", hlsl::ShaderModel::Kind::Amplification)
+          .Case("node", hlsl::ShaderModel::Kind::Node)
           .Default(hlsl::ShaderModel::Kind::Invalid);
   assert(SMK != hlsl::ShaderModel::Kind::Invalid);
   return SMK;
@@ -13292,6 +13605,7 @@ SpirvEmitter::getSpirvShaderStage(hlsl::ShaderModel::Kind smk,
   case hlsl::ShaderModel::Kind::Pixel:
     return spv::ExecutionModel::Fragment;
   case hlsl::ShaderModel::Kind::Compute:
+  case hlsl::ShaderModel::Kind::Node:
     return spv::ExecutionModel::GLCompute;
   case hlsl::ShaderModel::Kind::RayGeneration:
     return spv::ExecutionModel::RayGenerationNV;
@@ -13512,6 +13826,21 @@ void SpirvEmitter::processPixelShaderAttributes(const FunctionDecl *decl) {
   }
 }
 
+void SpirvEmitter::checkForWaveSizeAttr(const FunctionDecl *decl) {
+  if (auto *waveSizeAttr = decl->getAttr<HLSLWaveSizeAttr>()) {
+    // Not supported in Vulkan SPIR-V, warn and ignore.
+
+    // SPIR-V SubgroupSize execution mode would work but it is Kernel only
+    // (requires the SubgroupDispatch capability, which implies the
+    // DeviceEnqueue capability, which is Kernel only). Subgroup sizes can be
+    // specified in Vulkan on the application side via
+    // VK_EXT_subgroup_size_control.
+    emitWarning("Wave size is not supported by Vulkan SPIR-V. Consider using "
+                "VK_EXT_subgroup_size_control.",
+                waveSizeAttr->getLocation());
+  }
+}
+
 void SpirvEmitter::processComputeShaderAttributes(const FunctionDecl *decl) {
   auto *numThreadsAttr = decl->getAttr<HLSLNumThreadsAttr>();
   assert(numThreadsAttr && "thread group size missing from entry-point");
@@ -13523,19 +13852,82 @@ void SpirvEmitter::processComputeShaderAttributes(const FunctionDecl *decl) {
   spvBuilder.addExecutionMode(entryFunction, spv::ExecutionMode::LocalSize,
                               {x, y, z}, decl->getLocation());
 
-  auto *waveSizeAttr = decl->getAttr<HLSLWaveSizeAttr>();
-  if (waveSizeAttr) {
-    // Not supported in Vulkan SPIR-V, warn and ignore.
+  checkForWaveSizeAttr(decl);
+}
 
-    // SPIR-V SubgroupSize execution mode would work but it is Kernel only
-    // (requires the SubgroupDispatch capability, which implies the
-    // DeviceEnqueue capability, which is Kernel only). Subgroup sizes can be
-    // specified in Vulkan on the application side via
-    // VK_EXT_subgroup_size_control.
-    emitWarning("Wave size is not supported by Vulkan SPIR-V. Consider using "
-                "VK_EXT_subgroup_size_control.",
-                waveSizeAttr->getLocation());
+void SpirvEmitter::processNodeShaderAttributes(const FunctionDecl *decl) {
+  uint32_t x = 1, y = 1, z = 1;
+  if (auto *numThreadsAttr = decl->getAttr<HLSLNumThreadsAttr>()) {
+    x = static_cast<uint32_t>(numThreadsAttr->getX());
+    y = static_cast<uint32_t>(numThreadsAttr->getY());
+    z = static_cast<uint32_t>(numThreadsAttr->getZ());
+  }
+  spvBuilder.addExecutionMode(entryFunction, spv::ExecutionMode::LocalSize,
+                              {x, y, z}, decl->getLocation());
+
+  auto *nodeLaunchAttr = decl->getAttr<HLSLNodeLaunchAttr>();
+  StringRef launchType = nodeLaunchAttr ? nodeLaunchAttr->getLaunchType() : "";
+  if (launchType.equals("coalescing") || launchType.equals("thread")) {
+    spvBuilder.addExecutionMode(entryFunction,
+                                spv::ExecutionMode::CoalescingAMDX, {},
+                                decl->getLocation());
   }
+
+  uint64_t nodeId = 0;
+  if (const auto nodeIdAttr = decl->getAttr<HLSLNodeIdAttr>())
+    nodeId = static_cast<uint64_t>(nodeIdAttr->getArrayIndex());
+  spvBuilder.addExecutionModeId(
+      entryFunction, spv::ExecutionMode::ShaderIndexAMDX,
+      {spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                 llvm::APInt(32, nodeId))},
+      decl->getLocation());
+
+  if (const auto *nodeMaxRecursionDepthAttr =
+          decl->getAttr<HLSLNodeMaxRecursionDepthAttr>()) {
+    SpirvInstruction *count = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy,
+        llvm::APInt(32, nodeMaxRecursionDepthAttr->getCount()));
+    spvBuilder.addExecutionModeId(entryFunction,
+                                  spv::ExecutionMode::MaxNodeRecursionAMDX,
+                                  {count}, decl->getLocation());
+  }
+
+  if (const auto *nodeShareInputOfAttr =
+          decl->getAttr<HLSLNodeShareInputOfAttr>()) {
+    SpirvInstruction *name =
+        spvBuilder.getConstantString(nodeShareInputOfAttr->getName());
+    SpirvInstruction *index = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy,
+        llvm::APInt(32, nodeShareInputOfAttr->getArrayIndex()));
+    spvBuilder.addExecutionModeId(entryFunction,
+                                  spv::ExecutionMode::SharesInputWithAMDX,
+                                  {name, index}, decl->getLocation());
+  }
+
+  if (const auto *dispatchGrid = decl->getAttr<HLSLNodeDispatchGridAttr>()) {
+    SpirvInstruction *gridX = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getX()));
+    SpirvInstruction *gridY = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getY()));
+    SpirvInstruction *gridZ = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getZ()));
+    spvBuilder.addExecutionModeId(entryFunction,
+                                  spv::ExecutionMode::StaticNumWorkgroupsAMDX,
+                                  {gridX, gridY, gridZ}, decl->getLocation());
+  } else if (const auto *maxDispatchGrid =
+                 decl->getAttr<HLSLNodeMaxDispatchGridAttr>()) {
+    SpirvInstruction *gridX = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getX()));
+    SpirvInstruction *gridY = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getY()));
+    SpirvInstruction *gridZ = spvBuilder.getConstantInt(
+        astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getZ()));
+    spvBuilder.addExecutionModeId(entryFunction,
+                                  spv::ExecutionMode::MaxNumWorkgroupsAMDX,
+                                  {gridX, gridY, gridZ}, decl->getLocation());
+  }
+
+  checkForWaveSizeAttr(decl);
 }
 
 bool SpirvEmitter::processTessellationShaderAttributes(
@@ -13978,6 +14370,8 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
     processPixelShaderAttributes(decl);
   } else if (spvContext.isCS()) {
     processComputeShaderAttributes(decl);
+  } else if (spvContext.isNode()) {
+    processNodeShaderAttributes(decl);
   } else if (spvContext.isHS()) {
     if (!processTessellationShaderAttributes(decl, &numOutputControlPoints))
       return nullptr;
@@ -14086,12 +14480,23 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
   llvm::SmallVector<SpirvInstruction *, 4> params;
   for (const auto *param : decl->params()) {
     const auto paramType = param->getType();
+    if (hlsl::IsHLSLNodeInputType(paramType)) {
+      SpirvInstruction *value = nullptr;
+      if (!declIdMapper.createStageInputVar(param, &value, false))
+        return nullptr;
+      if (value && value->getKind() == SpirvInstruction::Kind::IK_Variable) {
+        handleNodePayloadArrayType(param, value);
+        params.push_back(value);
+      }
+      continue;
+    }
+
     std::string tempVarName = "param.var." + param->getNameAsString();
     auto *tempVar =
         spvBuilder.addFnVar(paramType, param->getLocation(), tempVarName,
                             param->hasAttr<HLSLPreciseAttr>(),
                             param->hasAttr<HLSLNoInterpolationAttr>());
-
+    handleNodePayloadArrayType(param, tempVar);
     params.push_back(tempVar);
 
     // Create the stage input variable for parameter not marked as pure out and
@@ -14109,6 +14514,9 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
 
       if (!declIdMapper.createStageInputVar(param, &loadedValue, false))
         return nullptr;
+      if (loadedValue) {
+        handleNodePayloadArrayType(param, loadedValue);
+      }
 
       // Only initialize the temporary variable if the parameter is indeed used,
       // or if it is an inout parameter.
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 0c77f2fc24..954b2c5dd3 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -126,6 +126,8 @@ class SpirvEmitter : public ASTConsumer {
                                        SourceRange range = {});
 
 private:
+  bool handleNodePayloadArrayType(const ParmVarDecl *decl,
+                                  SpirvInstruction *instr);
   void doFunctionDecl(const FunctionDecl *decl);
   void doVarDecl(const VarDecl *decl);
   void doRecordDecl(const RecordDecl *decl);
@@ -505,6 +507,9 @@ class SpirvEmitter : public ASTConsumer {
   SpirvInstruction *
   processIntrinsicGetBufferContents(const CXXMemberCallExpr *);
 
+  /// Processes the 'Barrier' intrinsic function.
+  SpirvInstruction *processIntrinsicBarrier(const CallExpr *);
+
   /// Processes the 'GroupMemoryBarrier', 'GroupMemoryBarrierWithGroupSync',
   /// 'DeviceMemoryBarrier', 'DeviceMemoryBarrierWithGroupSync',
   /// 'AllMemoryBarrier', and 'AllMemoryBarrierWithGroupSync' intrinsic
@@ -513,6 +518,40 @@ class SpirvEmitter : public ASTConsumer {
                                                   bool isDevice, bool groupSync,
                                                   bool isAllBarrier);
 
+  /// Processes the 'GetRemainingRecursionLevels' intrinsic function.
+  SpirvInstruction *
+  processIntrinsicGetRemainingRecursionLevels(const CallExpr *callExpr);
+
+  /// Processes the 'IsValid' intrinsic function.
+  SpirvInstruction *processIntrinsicIsValid(const CXXMemberCallExpr *callExpr);
+
+  /// Processes the 'Get' intrinsic function for (arrays of) node records and
+  /// the array subscript operator for node record arrays.
+  SpirvInstruction *
+  processIntrinsicExtractRecordStruct(const CXXMemberCallExpr *callExpr);
+
+  /// Processes the 'GetGroupNodeOutputRecords' and 'GetThreadNodeOutputRecords'
+  /// intrinsic functions.
+  SpirvInstruction *
+  processIntrinsicGetNodeOutputRecords(const CXXMemberCallExpr *callExpr,
+                                       bool isGroupShared);
+
+  /// Processes the 'IncrementOutputCount' intrinsic function.
+  SpirvInstruction *
+  processIntrinsicIncrementOutputCount(const CXXMemberCallExpr *callExpr,
+                                       bool isGroupShared);
+
+  /// Processes the 'Count' intrinsic function for node input record arrays.
+  SpirvInstruction *
+  processIntrinsicGetRecordCount(const CXXMemberCallExpr *callExpr);
+
+  /// Processes the 'OutputComplete' intrinsic function.
+  void processIntrinsicOutputComplete(const CXXMemberCallExpr *callExpr);
+
+  /// Processes the 'FinishedCrossGroupSharing' intrinsic function.
+  SpirvInstruction *
+  processIntrinsicFinishedCrossGroupSharing(const CXXMemberCallExpr *callExpr);
+
   /// Processes the 'mad' intrinsic function.
   SpirvInstruction *processIntrinsicMad(const CallExpr *);
 
@@ -850,6 +889,7 @@ class SpirvEmitter : public ASTConsumer {
   static hlsl::ShaderModel::Kind getShaderModelKind(StringRef stageName);
   static spv::ExecutionModel getSpirvShaderStage(hlsl::ShaderModel::Kind smk,
                                                  bool);
+  void checkForWaveSizeAttr(const FunctionDecl *decl);
 
   /// \brief Handle inline SPIR-V attributes for the entry function.
   void processInlineSpirvAttributes(const FunctionDecl *entryFunction);
@@ -876,6 +916,10 @@ class SpirvEmitter : public ASTConsumer {
   /// HLSL attributes of the entry point function.
   void processComputeShaderAttributes(const FunctionDecl *entryFunction);
 
+  /// \brief Adds necessary execution modes for the node shader based on the
+  /// HLSL attributes of the entry point function.
+  void processNodeShaderAttributes(const FunctionDecl *entryFunction);
+
   /// \brief Adds necessary execution modes for the mesh/amplification shader
   /// based on the HLSL attributes of the entry point function.
   bool
diff --git a/tools/clang/lib/SPIRV/SpirvInstruction.cpp b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
index f6ac29f379..88d669d397 100644
--- a/tools/clang/lib/SPIRV/SpirvInstruction.cpp
+++ b/tools/clang/lib/SPIRV/SpirvInstruction.cpp
@@ -52,6 +52,11 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvUnreachable)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAccessChain)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAtomic)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBarrier)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvIsNodePayloadValid)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvNodePayloadArrayLength)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAllocateNodePayloads)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvEnqueueNodePayloads)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvFinishWritingNodePayload)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBinaryOp)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBitFieldExtract)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBitFieldInsert)
@@ -59,6 +64,7 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantBoolean)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantInteger)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantFloat)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantComposite)
+DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantString)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantNull)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConvertPtrToU)
 DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConvertUToPtr)
@@ -469,6 +475,41 @@ SpirvBarrier::SpirvBarrier(SourceLocation loc, spv::Scope memScope,
       memoryScope(memScope), memorySemantics(memSemantics),
       executionScope(execScope) {}
 
+SpirvIsNodePayloadValid::SpirvIsNodePayloadValid(QualType resultType,
+                                                 SourceLocation loc,
+                                                 SpirvInstruction *payloadArray,
+                                                 SpirvInstruction *nodeIndex)
+    : SpirvInstruction(IK_IsNodePayloadValid, spv::Op::OpIsNodePayloadValidAMDX,
+                       resultType, loc),
+      payloadArray(payloadArray), nodeIndex(nodeIndex) {}
+
+SpirvNodePayloadArrayLength::SpirvNodePayloadArrayLength(
+    QualType resultType, SourceLocation loc, SpirvInstruction *payloadArray)
+    : SpirvInstruction(IK_NodePayloadArrayLength,
+                       spv::Op::OpNodePayloadArrayLengthAMDX, resultType, loc),
+      payloadArray(payloadArray) {}
+
+SpirvAllocateNodePayloads::SpirvAllocateNodePayloads(
+    QualType resultType, SourceLocation loc, spv::Scope allocationScope,
+    SpirvInstruction *shaderIndex, SpirvInstruction *recordCount)
+    : SpirvInstruction(IK_AllocateNodePayloads,
+                       spv::Op::OpAllocateNodePayloadsAMDX, resultType, loc),
+      allocationScope(allocationScope), shaderIndex(shaderIndex),
+      recordCount(recordCount) {}
+
+SpirvEnqueueNodePayloads::SpirvEnqueueNodePayloads(SourceLocation loc,
+                                                   SpirvInstruction *payload)
+    : SpirvInstruction(IK_EnqueueNodePayloads,
+                       spv::Op::OpEnqueueNodePayloadsAMDX, QualType(), loc),
+      payload(payload) {}
+
+SpirvFinishWritingNodePayload::SpirvFinishWritingNodePayload(
+    QualType resultType, SourceLocation loc, SpirvInstruction *payload)
+    : SpirvInstruction(IK_FinishWritingNodePayload,
+                       spv::Op::OpFinishWritingNodePayloadAMDX, resultType,
+                       loc),
+      payload(payload) {}
+
 SpirvBinaryOp::SpirvBinaryOp(spv::Op opcode, QualType resultType,
                              SourceLocation loc, SpirvInstruction *op1,
                              SpirvInstruction *op2, SourceRange range)
@@ -565,7 +606,8 @@ bool SpirvConstant::isSpecConstant() const {
   return opcode == spv::Op::OpSpecConstant ||
          opcode == spv::Op::OpSpecConstantTrue ||
          opcode == spv::Op::OpSpecConstantFalse ||
-         opcode == spv::Op::OpSpecConstantComposite;
+         opcode == spv::Op::OpSpecConstantComposite ||
+         opcode == spv::Op::OpSpecConstantStringAMDX;
 }
 
 SpirvConstantBoolean::SpirvConstantBoolean(QualType type, bool val,
@@ -620,6 +662,19 @@ SpirvConstantComposite::SpirvConstantComposite(
                     type),
       constituents(constituentsVec.begin(), constituentsVec.end()) {}
 
+SpirvConstantString::SpirvConstantString(llvm::StringRef stringLiteral,
+                                         bool isSpecConst)
+    : SpirvConstant(IK_ConstantString,
+                    isSpecConst ? spv::Op::OpSpecConstantStringAMDX
+                                : spv::Op::OpConstantStringAMDX,
+                    QualType()),
+      str(stringLiteral) {}
+
+bool SpirvConstantString::operator==(const SpirvConstantString &that) const {
+  return opcode == that.opcode && resultType == that.resultType &&
+         str == that.str;
+}
+
 SpirvConstantNull::SpirvConstantNull(QualType type)
     : SpirvConstant(IK_ConstantNull, spv::Op::OpConstantNull, type) {}
 
diff --git a/tools/clang/lib/SPIRV/SpirvType.cpp b/tools/clang/lib/SPIRV/SpirvType.cpp
index cabeba4cda..286e6224a4 100644
--- a/tools/clang/lib/SPIRV/SpirvType.cpp
+++ b/tools/clang/lib/SPIRV/SpirvType.cpp
@@ -167,6 +167,10 @@ bool RuntimeArrayType::operator==(const RuntimeArrayType &that) const {
          (!stride.hasValue() || stride.getValue() == that.stride.getValue());
 }
 
+bool NodePayloadArrayType::operator==(const NodePayloadArrayType &that) const {
+  return elementType == that.elementType && nodeDecl == that.nodeDecl;
+}
+
 bool SpvIntrinsicTypeOperand::operator==(
     const SpvIntrinsicTypeOperand &that) const {
   if (isTypeOperand != that.isTypeOperand)
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 2163eef8a3..744b06b8d0 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -17137,6 +17137,10 @@ void DiagnoseNodeEntry(Sema &S, FunctionDecl *FD, llvm::StringRef StageName,
   DXIL::ShaderKind shaderKind = ShaderModel::KindFromFullName(StageName);
   if (shaderKind == DXIL::ShaderKind::Node) {
     NodeLoc = pAttr->getLocation();
+    // SPIR-V node shader support is experimental
+    if (S.getLangOpts().SPIRV) {
+      S.Diag(NodeLoc, diag::warn_spirv_node_shaders_experimental);
+    }
   }
   if (NodeLoc.isInvalid()) {
     return;
diff --git a/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv b/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv
index 30565394b4..1425137c68 100644
--- a/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv
+++ b/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv
@@ -161,7 +161,7 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POIN
 // CHECK-NEXT:     %v4float = OpTypeVector %float 4
 // CHECK-NEXT: %_ptr_Output_v4float = OpTypePointer Output %v4float
 // CHECK-NEXT:        %void = OpTypeVoid
-// CHECK-NEXT:          %51 = OpTypeFunction %void
+// CHECK-NEXT:          [[L1:%[1-9][0-9]*]] = OpTypeFunction %void
 // CHECK-NEXT: %VS_CONTROL_POINT_OUTPUT = OpTypeStruct %v3float %v2float %v3float
 // CHECK-NEXT: %_arr_VS_CONTROL_POINT_OUTPUT_uint_3 = OpTypeArray %VS_CONTROL_POINT_OUTPUT %uint_3
 // CHECK-NEXT: %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 = OpTypePointer Function %_arr_VS_CONTROL_POINT_OUTPUT_uint_3
@@ -170,12 +170,12 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POIN
 // CHECK-NEXT: %_ptr_Output_v3float = OpTypePointer Output %v3float
 // CHECK-NEXT:        %bool = OpTypeBool
 // CHECK-NEXT: %HS_CONSTANT_DATA_OUTPUT = OpTypeStruct %_arr_float_uint_4 %_arr_float_uint_2 %_arr_v3float_uint_4 %_arr_v2float_uint_4 %_arr_v3float_uint_4 %_arr_v3float_uint_4 %v4float
-// CHECK-NEXT:          %98 = OpTypeFunction %HS_CONSTANT_DATA_OUTPUT %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 %_ptr_Function_uint
+// CHECK-NEXT:          [[L2:%[1-9][0-9]*]] = OpTypeFunction %HS_CONSTANT_DATA_OUTPUT %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 %_ptr_Function_uint
 // CHECK-NEXT: %_ptr_Function_HS_CONSTANT_DATA_OUTPUT = OpTypePointer Function %HS_CONSTANT_DATA_OUTPUT
 // CHECK-NEXT: %_ptr_Function__arr_float_uint_4 = OpTypePointer Function %_arr_float_uint_4
 // CHECK-NEXT: %_ptr_Function_float = OpTypePointer Function %float
 // CHECK-NEXT: %_ptr_Function__arr_float_uint_2 = OpTypePointer Function %_arr_float_uint_2
-// CHECK-NEXT:         %120 = OpTypeFunction %BEZIER_CONTROL_POINT %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 %_ptr_Function_uint %_ptr_Function_uint
+// CHECK-NEXT:         [[L3:%[1-9][0-9]*]] = OpTypeFunction %BEZIER_CONTROL_POINT %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 %_ptr_Function_uint %_ptr_Function_uint
 // CHECK-NEXT: %_ptr_Function_VS_CONTROL_POINT_OUTPUT = OpTypePointer Function %VS_CONTROL_POINT_OUTPUT
 // CHECK-NEXT: %_ptr_Function_BEZIER_CONTROL_POINT = OpTypePointer Function %BEZIER_CONTROL_POINT
 // CHECK-NEXT: %_ptr_Function_v3float = OpTypePointer Function %v3float
@@ -192,94 +192,94 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POIN
 // CHECK-NEXT: %out_var_TANUCORNER = OpVariable %_ptr_Output__arr_v3float_uint_4 Output
 // CHECK-NEXT: %out_var_TANVCORNER = OpVariable %_ptr_Output__arr_v3float_uint_4 Output
 // CHECK-NEXT: %out_var_TANWEIGHTS = OpVariable %_ptr_Output_v4float Output
-// CHECK-NEXT: %SubDToBezierHS = OpFunction %void None %51
-// CHECK-NEXT:          %52 = OpLabel
+// CHECK-NEXT: %SubDToBezierHS = OpFunction %void None [[L1]]
+// CHECK-NEXT:          [[L4:%[1-9][0-9]*]] = OpLabel
 // CHECK-NEXT: %param_var_ip = OpVariable %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3 Function
 // CHECK-NEXT: %param_var_cpid = OpVariable %_ptr_Function_uint Function
 // CHECK-NEXT: %param_var_PatchID = OpVariable %_ptr_Function_uint Function
-// CHECK-NEXT:          %60 = OpLoad %_arr_v3float_uint_3 %in_var_WORLDPOS
-// CHECK-NEXT:          %61 = OpLoad %_arr_v2float_uint_3 %in_var_TEXCOORD0
-// CHECK-NEXT:          %62 = OpLoad %_arr_v3float_uint_3 %in_var_TANGENT
-// CHECK-NEXT:          %63 = OpCompositeExtract %v3float %60 0
-// CHECK-NEXT:          %64 = OpCompositeExtract %v2float %61 0
-// CHECK-NEXT:          %65 = OpCompositeExtract %v3float %62 0
-// CHECK-NEXT:          %66 = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT %63 %64 %65
-// CHECK-NEXT:          %67 = OpCompositeExtract %v3float %60 1
-// CHECK-NEXT:          %68 = OpCompositeExtract %v2float %61 1
-// CHECK-NEXT:          %69 = OpCompositeExtract %v3float %62 1
-// CHECK-NEXT:          %70 = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT %67 %68 %69
-// CHECK-NEXT:          %71 = OpCompositeExtract %v3float %60 2
-// CHECK-NEXT:          %72 = OpCompositeExtract %v2float %61 2
-// CHECK-NEXT:          %73 = OpCompositeExtract %v3float %62 2
-// CHECK-NEXT:          %74 = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT %71 %72 %73
-// CHECK-NEXT:          %75 = OpCompositeConstruct %_arr_VS_CONTROL_POINT_OUTPUT_uint_3 %66 %70 %74
-// CHECK-NEXT:          %76 = OpLoad %uint %gl_InvocationID
-// CHECK-NEXT:          %77 = OpLoad %uint %gl_PrimitiveID
-// CHECK-NEXT:          %79 = OpFunctionCall %BEZIER_CONTROL_POINT %src_SubDToBezierHS %param_var_ip %param_var_cpid %param_var_PatchID
-// CHECK-NEXT:          %81 = OpCompositeExtract %v3float %79 0
-// CHECK-NEXT:          %83 = OpAccessChain %_ptr_Output_v3float %out_var_BEZIERPOS %76
-// CHECK-NEXT:                OpStore %83 %81
+// CHECK-NEXT:          [[L5:%[1-9][0-9]*]] = OpLoad %_arr_v3float_uint_3 %in_var_WORLDPOS
+// CHECK-NEXT:          [[L6:%[1-9][0-9]*]] = OpLoad %_arr_v2float_uint_3 %in_var_TEXCOORD0
+// CHECK-NEXT:          [[L7:%[1-9][0-9]*]] = OpLoad %_arr_v3float_uint_3 %in_var_TANGENT
+// CHECK-NEXT:          [[L8:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L5]] 0
+// CHECK-NEXT:          [[L9:%[1-9][0-9]*]] = OpCompositeExtract %v2float [[L6]] 0
+// CHECK-NEXT:          [[L10:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L7]] 0
+// CHECK-NEXT:          [[L11:%[1-9][0-9]*]] = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT [[L8]] [[L9]] [[L10]]
+// CHECK-NEXT:          [[L12:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L5]] 1
+// CHECK-NEXT:          [[L13:%[1-9][0-9]*]] = OpCompositeExtract %v2float [[L6]] 1
+// CHECK-NEXT:          [[L14:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L7]] 1
+// CHECK-NEXT:          [[L15:%[1-9][0-9]*]] = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT [[L12]] [[L13]] [[L14]]
+// CHECK-NEXT:          [[L16:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L5]] 2
+// CHECK-NEXT:          [[L17:%[1-9][0-9]*]] = OpCompositeExtract %v2float [[L6]] 2
+// CHECK-NEXT:          [[L18:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L7]] 2
+// CHECK-NEXT:          [[L19:%[1-9][0-9]*]] = OpCompositeConstruct %VS_CONTROL_POINT_OUTPUT [[L16]] [[L17]] [[L18]]
+// CHECK-NEXT:          [[L20:%[1-9][0-9]*]] = OpCompositeConstruct %_arr_VS_CONTROL_POINT_OUTPUT_uint_3 [[L11]] [[L15]] [[L19]]
+// CHECK-NEXT:          [[L21:%[1-9][0-9]*]] = OpLoad %uint %gl_InvocationID
+// CHECK-NEXT:          [[L22:%[1-9][0-9]*]] = OpLoad %uint %gl_PrimitiveID
+// CHECK-NEXT:          [[L23:%[1-9][0-9]*]] = OpFunctionCall %BEZIER_CONTROL_POINT %src_SubDToBezierHS %param_var_ip %param_var_cpid %param_var_PatchID
+// CHECK-NEXT:          [[L24:%[1-9][0-9]*]] = OpCompositeExtract %v3float [[L23]] 0
+// CHECK-NEXT:          [[L25:%[1-9][0-9]*]] = OpAccessChain %_ptr_Output_v3float %out_var_BEZIERPOS [[L21]]
+// CHECK-NEXT:                OpStore [[L25]] [[L24]]
 // CHECK-NEXT:                OpControlBarrier %uint_2 %uint_4 %uint_0
-// CHECK-NEXT:          %85 = OpIEqual %bool %76 %uint_0
+// CHECK-NEXT:          [[L26:%[1-9][0-9]*]] = OpIEqual %bool [[L21]] %uint_0
 // CHECK-NEXT:                OpSelectionMerge %if_merge None
-// CHECK-NEXT:                OpBranchConditional %85 %if_true %if_merge
+// CHECK-NEXT:                OpBranchConditional [[L26]] %if_true %if_merge
 // CHECK-NEXT:     %if_true = OpLabel
-// CHECK-NEXT:          %89 = OpFunctionCall %HS_CONSTANT_DATA_OUTPUT %SubDToBezierConstantsHS %param_var_ip %param_var_PatchID
-// CHECK-NEXT:          %91 = OpCompositeExtract %_arr_float_uint_4 %89 0
-// CHECK-NEXT:                OpStore %gl_TessLevelOuter %91
-// CHECK-NEXT:          %92 = OpCompositeExtract %_arr_float_uint_2 %89 1
-// CHECK-NEXT:                OpStore %gl_TessLevelInner %92
-// CHECK-NEXT:          %93 = OpCompositeExtract %_arr_v3float_uint_4 %89 2
-// CHECK-NEXT:                OpStore %out_var_TANGENT %93
-// CHECK-NEXT:          %94 = OpCompositeExtract %_arr_v2float_uint_4 %89 3
-// CHECK-NEXT:                OpStore %out_var_TEXCOORD %94
-// CHECK-NEXT:          %95 = OpCompositeExtract %_arr_v3float_uint_4 %89 4
-// CHECK-NEXT:                OpStore %out_var_TANUCORNER %95
-// CHECK-NEXT:          %96 = OpCompositeExtract %_arr_v3float_uint_4 %89 5
-// CHECK-NEXT:                OpStore %out_var_TANVCORNER %96
-// CHECK-NEXT:          %97 = OpCompositeExtract %v4float %89 6
-// CHECK-NEXT:                OpStore %out_var_TANWEIGHTS %97
+// CHECK-NEXT:          [[L27:%[1-9][0-9]*]] = OpFunctionCall %HS_CONSTANT_DATA_OUTPUT %SubDToBezierConstantsHS %param_var_ip %param_var_PatchID
+// CHECK-NEXT:          [[L28:%[1-9][0-9]*]] = OpCompositeExtract %_arr_float_uint_4 [[L27]] 0
+// CHECK-NEXT:                OpStore %gl_TessLevelOuter [[L28]]
+// CHECK-NEXT:          [[L29:%[1-9][0-9]*]] = OpCompositeExtract %_arr_float_uint_2 [[L27]] 1
+// CHECK-NEXT:                OpStore %gl_TessLevelInner [[L29]]
+// CHECK-NEXT:          [[L30:%[1-9][0-9]*]] = OpCompositeExtract %_arr_v3float_uint_4 [[L27]] 2
+// CHECK-NEXT:                OpStore %out_var_TANGENT [[L30]]
+// CHECK-NEXT:          [[L31:%[1-9][0-9]*]] = OpCompositeExtract %_arr_v2float_uint_4 [[L27]] 3
+// CHECK-NEXT:                OpStore %out_var_TEXCOORD [[L31]]
+// CHECK-NEXT:          [[L32:%[1-9][0-9]*]] = OpCompositeExtract %_arr_v3float_uint_4 [[L27]] 4
+// CHECK-NEXT:                OpStore %out_var_TANUCORNER [[L32]]
+// CHECK-NEXT:          [[L33:%[1-9][0-9]*]] = OpCompositeExtract %_arr_v3float_uint_4 [[L27]] 5
+// CHECK-NEXT:                OpStore %out_var_TANVCORNER [[L33]]
+// CHECK-NEXT:          [[L34:%[1-9][0-9]*]] = OpCompositeExtract %v4float [[L27]] 6
+// CHECK-NEXT:                OpStore %out_var_TANWEIGHTS [[L34]]
 // CHECK-NEXT:                OpBranch %if_merge
 // CHECK-NEXT:    %if_merge = OpLabel
 // CHECK-NEXT:                OpReturn
 // CHECK-NEXT:                OpFunctionEnd
-// CHECK-NEXT: %SubDToBezierConstantsHS = OpFunction %HS_CONSTANT_DATA_OUTPUT None %98
+// CHECK-NEXT: %SubDToBezierConstantsHS = OpFunction %HS_CONSTANT_DATA_OUTPUT None [[L2]]
 // CHECK-NEXT:          %ip = OpFunctionParameter %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3
 // CHECK-NEXT:     %PatchID = OpFunctionParameter %_ptr_Function_uint
 // CHECK-NEXT:    %bb_entry = OpLabel
 // CHECK-NEXT:      %Output = OpVariable %_ptr_Function_HS_CONSTANT_DATA_OUTPUT Function
-// CHECK-NEXT:         %105 = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
-// CHECK-NEXT:         %107 = OpAccessChain %_ptr_Function_float %105 %int_0
-// CHECK-NEXT:                OpStore %107 %float_1
-// CHECK-NEXT:         %108 = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
-// CHECK-NEXT:         %109 = OpAccessChain %_ptr_Function_float %108 %int_1
-// CHECK-NEXT:                OpStore %109 %float_2
-// CHECK-NEXT:         %110 = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
-// CHECK-NEXT:         %111 = OpAccessChain %_ptr_Function_float %110 %int_2
-// CHECK-NEXT:                OpStore %111 %float_3
-// CHECK-NEXT:         %112 = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
-// CHECK-NEXT:         %113 = OpAccessChain %_ptr_Function_float %112 %int_3
-// CHECK-NEXT:                OpStore %113 %float_4
-// CHECK-NEXT:         %115 = OpAccessChain %_ptr_Function__arr_float_uint_2 %Output %int_1
-// CHECK-NEXT:         %116 = OpAccessChain %_ptr_Function_float %115 %int_0
-// CHECK-NEXT:                OpStore %116 %float_5
-// CHECK-NEXT:         %117 = OpAccessChain %_ptr_Function__arr_float_uint_2 %Output %int_1
-// CHECK-NEXT:         %118 = OpAccessChain %_ptr_Function_float %117 %int_1
-// CHECK-NEXT:                OpStore %118 %float_6
-// CHECK-NEXT:         %119 = OpLoad %HS_CONSTANT_DATA_OUTPUT %Output
-// CHECK-NEXT:                OpReturnValue %119
+// CHECK-NEXT:         [[L35:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
+// CHECK-NEXT:         [[L36:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L35]] %int_0
+// CHECK-NEXT:                OpStore [[L36]] %float_1
+// CHECK-NEXT:         [[L37:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
+// CHECK-NEXT:         [[L38:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L37]] %int_1
+// CHECK-NEXT:                OpStore [[L38]] %float_2
+// CHECK-NEXT:         [[L39:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
+// CHECK-NEXT:         [[L40:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L39]] %int_2
+// CHECK-NEXT:                OpStore [[L40]] %float_3
+// CHECK-NEXT:         [[L41:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_4 %Output %int_0
+// CHECK-NEXT:         [[L42:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L41]] %int_3
+// CHECK-NEXT:                OpStore [[L42]] %float_4
+// CHECK-NEXT:         [[L43:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_2 %Output %int_1
+// CHECK-NEXT:         [[L44:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L43]] %int_0
+// CHECK-NEXT:                OpStore [[L44]] %float_5
+// CHECK-NEXT:         [[L45:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function__arr_float_uint_2 %Output %int_1
+// CHECK-NEXT:         [[L46:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_float [[L45]] %int_1
+// CHECK-NEXT:                OpStore [[L46]] %float_6
+// CHECK-NEXT:         [[L47:%[1-9][0-9]*]] = OpLoad %HS_CONSTANT_DATA_OUTPUT %Output
+// CHECK-NEXT:                OpReturnValue [[L47]]
 // CHECK-NEXT:                OpFunctionEnd
-// CHECK-NEXT: %src_SubDToBezierHS = OpFunction %BEZIER_CONTROL_POINT None %120
+// CHECK-NEXT: %src_SubDToBezierHS = OpFunction %BEZIER_CONTROL_POINT None [[L3]]
 // CHECK-NEXT:        %ip_0 = OpFunctionParameter %_ptr_Function__arr_VS_CONTROL_POINT_OUTPUT_uint_3
 // CHECK-NEXT:        %cpid = OpFunctionParameter %_ptr_Function_uint
 // CHECK-NEXT:   %PatchID_0 = OpFunctionParameter %_ptr_Function_uint
 // CHECK-NEXT:  %bb_entry_0 = OpLabel
 // CHECK-NEXT:    %vsOutput = OpVariable %_ptr_Function_VS_CONTROL_POINT_OUTPUT Function
 // CHECK-NEXT:      %result = OpVariable %_ptr_Function_BEZIER_CONTROL_POINT Function
-// CHECK-NEXT:         %130 = OpAccessChain %_ptr_Function_v3float %vsOutput %int_0
-// CHECK-NEXT:         %131 = OpLoad %v3float %130
-// CHECK-NEXT:         %132 = OpAccessChain %_ptr_Function_v3float %result %int_0
-// CHECK-NEXT:                OpStore %132 %131
-// CHECK-NEXT:         %133 = OpLoad %BEZIER_CONTROL_POINT %result
-// CHECK-NEXT:                OpReturnValue %133
+// CHECK-NEXT:         [[L48:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_v3float %vsOutput %int_0
+// CHECK-NEXT:         [[L49:%[1-9][0-9]*]] = OpLoad %v3float [[L48]]
+// CHECK-NEXT:         [[L50:%[1-9][0-9]*]] = OpAccessChain %_ptr_Function_v3float %result %int_0
+// CHECK-NEXT:                OpStore [[L50]] [[L49]]
+// CHECK-NEXT:         [[L51:%[1-9][0-9]*]] = OpLoad %BEZIER_CONTROL_POINT %result
+// CHECK-NEXT:                OpReturnValue [[L51]]
 // CHECK-NEXT:                OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl b/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl
index 6bbcdd3764..08669c3de0 100644
--- a/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl
@@ -8,13 +8,13 @@ struct ControlPoint { float4 position : POSITION; };
 // CHECK: OpFunctionCall %void %HullConst %param_var_edge %param_var_inside %param_var_myFloat 
 // CHECK: [[edges:%[0-9]+]] = OpLoad %_arr_float_uint_3 %param_var_edge 
 // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_0 
-// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 0 
+// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr:%[0-9]+]] 0 
 // CHECK: OpStore [[addr]] [[val]]
 // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_1 
-// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 1 
+// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr]] 1 
 // CHECK: OpStore [[addr]] [[val]]
 // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_2 
-// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 2 
+// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr]] 2 
 // CHECK: OpStore [[addr]] [[val]]
 // CHECK: [[val:%[0-9]+]] = OpLoad %float %param_var_inside 
 // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %uint_0 
diff --git a/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl
new file mode 100644
index 0000000000..42b18d35a0
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s
+
+// Barrier is called from a compute shader
+
+[Shader("compute")]
+[NumThreads(5,1,1)]
+void node116_barrier_compute()
+{
+  Barrier(1, 3);
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U72:%[^ ]*]] = OpConstant [[UINT]] 72
+// CHECK: OpControlBarrier [[U2]] [[U2]] [[U72]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl
new file mode 100644
index 0000000000..9b2dc23eea
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl
@@ -0,0 +1,60 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s
+
+// Barrier is called using a memory type argument
+
+static const int a = 7;
+static const int16_t b = 2;
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(16, 1, 1)]
+void node117_barrier_memoryarg()
+{
+  // literal integer flag values
+  Barrier(1, 3);
+
+  // static const integer flag values
+  Barrier(a, b);
+
+  // AllMemoryBarrier() ->
+  Barrier(UAV_MEMORY|GROUP_SHARED_MEMORY|NODE_INPUT_MEMORY|NODE_OUTPUT_MEMORY,
+          DEVICE_SCOPE);
+
+  // AllMemoryBarrierWithGroupSync() ->
+  Barrier(UAV_MEMORY|GROUP_SHARED_MEMORY|NODE_INPUT_MEMORY|NODE_OUTPUT_MEMORY,
+          GROUP_SYNC|DEVICE_SCOPE);
+
+  // DeviceMemoryBarrier() ->
+  Barrier(UAV_MEMORY,
+          DEVICE_SCOPE);
+
+  // DeviceMemoryBarrierWithGroupSync() ->
+  Barrier(UAV_MEMORY,
+          GROUP_SYNC|DEVICE_SCOPE);
+
+  // GroupMemoryBarrier() ->
+  Barrier(GROUP_SHARED_MEMORY,
+          GROUP_SCOPE);
+
+  // GroupMemoryBarrierWithGroupSync() ->
+  Barrier(GROUP_SHARED_MEMORY,
+          GROUP_SYNC|GROUP_SCOPE);
+}
+
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant %uint 2
+// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant %uint 5
+// CHECK-DAG: [[U72:%[^ ]*]] = OpConstant %uint 72
+// CHECK-DAG: [[U264:%[^ ]*]] = OpConstant %uint 264
+// CHECK-DAG: [[U328:%[^ ]*]] = OpConstant %uint 328
+// CHECK-DAG: [[U4424:%[^ ]*]] = OpConstant %uint 4424
+
+// CHECK: OpControlBarrier [[U2]] [[U2]] [[U72]]
+// CHECK: OpMemoryBarrier [[U2]] [[U328]]
+// CHECK: OpMemoryBarrier [[U5]] [[U4424]]
+// CHECK: OpControlBarrier [[U2]] [[U5]] [[U4424]]
+// CHECK: OpMemoryBarrier [[U5]] [[U72]]
+// CHECK: OpControlBarrier [[U2]] [[U5]] [[U72]]
+// CHECK: OpMemoryBarrier [[U2]] [[U264]]
+// CHECK: OpControlBarrier [[U2]] [[U2]] [[U264]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl
new file mode 100644
index 0000000000..215acf7bfd
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl
@@ -0,0 +1,213 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Barrier is called with each node record and UAV type
+
+struct RECORD
+{
+    uint value;
+};
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U256:%[^ ]*]] = OpConstant [[UINT]] 256
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U3:%[^ ]*]] = OpConstant [[UINT]] 3
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U4424:%[^ ]*]] = OpConstant [[UINT]] 4424
+// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,1)]
+[NodeDispatchGrid(256,1,1)]
+void node01(DispatchNodeInputRecord<RECORD> input)
+{
+   Barrier(input, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(256,1,1)]
+void node02([MaxRecords(8)] GroupNodeInputRecords<RECORD> input)
+{
+   Barrier(input, 3);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void node03(RWThreadNodeInputRecord<RECORD> input)
+{
+   Barrier(input, 0);
+}
+
+// CHECK: OpMemoryBarrier %uint_4 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(256,1,1)]
+void node04([MaxRecords(6)] RWGroupNodeInputRecords<RECORD> input)
+{
+   Barrier(input, 0);
+}
+
+// CHECK: OpMemoryBarrier %uint_4 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,1)]
+[NodeDispatchGrid(256,1,1)]
+void node05([MaxRecords(5)] NodeOutput<RECORD> outputs)
+{
+   ThreadNodeOutputRecords<RECORD> outrec = outputs.GetThreadNodeOutputRecords(1);
+   Barrier(outrec, 0);
+}
+
+// CHECK: OpMemoryBarrier %uint_4 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void node06([MaxRecords(5)] NodeOutput<RECORD> outputs)
+{
+   ThreadNodeOutputRecords<RECORD> outrec = outputs.GetThreadNodeOutputRecords(3);
+   Barrier(outrec, 0);
+}
+
+// CHECK: OpMemoryBarrier %uint_4 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(256,1,3)]
+void node07([MaxRecords(5)] NodeOutput<RECORD> outputs)
+{
+   GroupNodeOutputRecords<RECORD> outrec = outputs.GetGroupNodeOutputRecords(1);
+   Barrier(outrec, 3);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node08([MaxRecords(5)] NodeOutput<RECORD> outputs)
+{
+   GroupNodeOutputRecords<RECORD> outrec = outputs.GetGroupNodeOutputRecords(4);
+   Barrier(outrec, 3);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424
+
+RWBuffer<float> obj09;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node09()
+{
+   Barrier(obj09, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWTexture1D<float4> obj10;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node10()
+{
+   Barrier(obj10, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWTexture1DArray<float4> obj11;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node11()
+{
+   Barrier(obj11, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWTexture2D<float> obj12;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node12()
+{
+   Barrier(obj12, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWTexture2DArray<float> obj13;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node13()
+{
+   Barrier(obj13, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWTexture3D<float> obj14;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node14()
+{
+   Barrier(obj14, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWStructuredBuffer<RECORD> obj15;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node15()
+{
+   Barrier(obj15, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+RWByteAddressBuffer obj16;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node16()
+{
+   Barrier(obj16, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
+
+AppendStructuredBuffer<RECORD> obj17;
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(256,1,4)]
+[NodeDispatchGrid(256,1,1)]
+void node17()
+{
+   Barrier(obj17, 5);
+}
+
+// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424
diff --git a/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl
new file mode 100644
index 0000000000..a3c369b252
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Broadcasting launch node with no input
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(3,4,5)]
+[NumThreads(6,7,1)]
+[NodeIsProgramEntry]
+void node070_broadcasting_noinput()
+{
+}
+
+// CHECK: OpReturn
+
diff --git a/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl b/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl
new file mode 100644
index 0000000000..14e899da02
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s
+
+// Coalescing launch node with thread group defined in the shader
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(1024,1,1)]
+[NodeIsProgramEntry]
+void node008_coalescing_numthreads_shader()
+{
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]]
+// CHECK-DAG: OpExecutionMode [[SHADER]] CoalescingAMDX
+// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl b/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl
new file mode 100644
index 0000000000..302c8ea698
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Broadcasting launch node with dispatch grid defined in shader
+
+struct INPUT_NOGRID
+{
+  uint textureIndex;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(2,3,2)]
+[NumThreads(1024,1,1)]
+[NodeIsProgramEntry]
+void node001_dispatchgrid_shader(DispatchNodeInputRecord<INPUT_NOGRID> input)
+{
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]]
+// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1
+// CHECK-DAG: OpExecutionModeId [[SHADER]] StaticNumWorkgroupsAMDX [[U2:%[0-9A-Za-z_]*]]
+// CHECK-SAME: [[U3:%[^ ]*]] [[U2]]
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U3]] = OpConstant [[UINT]] 3
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
new file mode 100644
index 0000000000..fa16429a1b
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Coalescing launch node declares EmptyNodeInput
+
+RWBuffer<uint> buf0;
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NodeIsProgramEntry]
+[NumThreads(2,1,1)]
+void emptynodeinput(EmptyNodeInput input)
+{
+  // input.Count should always return 1 here, so there is
+  // an opportunity for an optimization.
+  buf0[0] = input.Count();
+}
+
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 R32ui
+// CHECK-DAG: [[IMGPTR:%[^ ]*]] = OpTypePointer UniformConstant [[IMG]]
+// CHECK-DAG: [[BUF:%[^ ]*]] = OpVariable [[IMGPTR]] UniformConstant
+
+// CHECK: [[COUNT:%[^ ]*]] = OpNodePayloadArrayLengthAMDX [[UINT]]
+// CHECK: [[IMAGE:%[^ ]*]] = OpLoad [[IMG]] [[BUF]]
+// CHECK: OpImageWrite [[IMAGE]] [[U0]] [[COUNT]] None
diff --git a/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl b/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl
new file mode 100644
index 0000000000..8e1ce56307
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl
@@ -0,0 +1,32 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// FinishedCrossGroupSharing() is called with RWDispatchNodeInputRecord
+
+RWBuffer<uint> buf0;
+
+struct [NodeTrackRWInputSharing] INPUT_RECORD
+{
+  uint value;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(256,1,1)]
+[NumThreads(1,1,1)]
+void node037_finishedcrossgroupsharing(RWDispatchNodeInputRecord<INPUT_RECORD> input)
+{
+  bool b = input.FinishedCrossGroupSharing();
+  buf0[0] = b ? 0 : 1; // select on b so the call result feeds the buffer store
+}
+
+// CHECK: OpName [[INPUT:%[^ ]*]] "input"
+// CHECK: OpDecorate [[STRUCT:%[^ ]*]] TrackFinishWritingAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[STRUCT]] = OpTypeStruct [[UINT]]
+// CHECK: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]]
+// CHECK: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]]
+// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool
+// CHECK: [[INPUT]] = OpFunctionParameter [[PTR]]
+// CHECK: OpFinishWritingNodePayloadAMDX [[BOOL]] [[INPUT]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl
new file mode 100644
index 0000000000..a3af668c46
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl
@@ -0,0 +1,25 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// GetInputRecordCount() called with NodeInputRecordArray
+
+RWBuffer<uint> buf0;
+
+struct INPUT_RECORD
+{
+    uint textureIndex;
+};
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(1024,1,1)]
+[NodeIsProgramEntry]
+void node014_getinputrecordcount([MaxRecords(256)] GroupNodeInputRecords<INPUT_RECORD> inputs)
+{
+  uint numRecords = inputs.Count();
+  buf0[0] = numRecords;
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: OpNodePayloadArrayLengthAMDX [[UINT]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl
new file mode 100644
index 0000000000..d029bd20bb
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl
@@ -0,0 +1,72 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Multiple calls to Get*NodeOutputRecords(array)
+
+struct RECORD {
+  int i;
+  float3 foo;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(64, 1, 1)]
+[NodeDispatchGrid(8, 1, 1)]
+void node150_a(NodeOutput<RECORD> output)
+{
+  GroupNodeOutputRecords<RECORD> outRec1 = output.GetGroupNodeOutputRecords(1);
+  GroupNodeOutputRecords<RECORD> outRec2 = output.GetGroupNodeOutputRecords(4);
+  outRec1.OutputComplete();
+  outRec2.OutputComplete();
+}
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(64, 1, 1)]
+[NodeDispatchGrid(8, 1, 1)]
+void node150_b(NodeOutput<RECORD> output)
+{
+  ThreadNodeOutputRecords<RECORD> outRec1 = output.GetThreadNodeOutputRecords(5);
+  ThreadNodeOutputRecords<RECORD> outRec2 = output.GetThreadNodeOutputRecords(1);
+  outRec1.OutputComplete();
+  outRec1 = outRec2;
+  outRec1.OutputComplete();
+}
+
+// CHECK: OpDecorateId [[ARR_A:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]]
+// CHECK: OpDecorateId [[ARR_B:%[^ ]*]] PayloadNodeNameAMDX [[STR]]
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5
+// CHECK-DAG: [[STR]] = OpConstantStringAMDX "output"
+// CHECK-DAG: [[ARR_A]] = OpTypeNodePayloadArrayAMDX
+// CHECK-DAG: [[ARR_B]] = OpTypeNodePayloadArrayAMDX
+// CHECK-DAG: [[FPTR_A:%[^ ]*]] = OpTypePointer Function [[ARR_A]]
+// CHECK-DAG: [[NPTR_A:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR_A]]
+// CHECK-DAG: [[FPTR_B:%[^ ]*]] = OpTypePointer Function [[ARR_B]]
+// CHECK-DAG: [[NPTR_B:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR_B]]
+
+// checking for OpFunctionCall skips over the entry function wrapper and
+// thereby avoids matching wrapper variables
+// CHECK: OpFunctionCall
+// CHECK: [[OUT1:%[^ ]*]] = OpVariable [[FPTR_A]]
+// CHECK: [[OUT2:%[^ ]*]] = OpVariable [[FPTR_A]]
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_A]] [[U2]] [[U1]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_A]] [[PAY]]
+// CHECK: OpStore [[OUT1]] [[VAL]]
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_A]] [[U2]] [[U4]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_A]] [[PAY]]
+// CHECK: OpStore [[OUT2]] [[VAL]]
+// CHECK: OpFunctionCall
+// CHECK: [[OUT1:%[^ ]*]] = OpVariable [[FPTR_B]]
+// CHECK: [[OUT2:%[^ ]*]] = OpVariable [[FPTR_B]]
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_B]] [[U4]] [[U5]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_B]] [[PAY]]
+// CHECK: OpStore [[OUT1]] [[VAL]]
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_B]] [[U4]] [[U1]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_B]] [[PAY]]
+// CHECK: OpStore [[OUT2]] [[VAL]]
+// CHECK: OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl
new file mode 100644
index 0000000000..f981282748
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl
@@ -0,0 +1,26 @@
+// RUN: %dxc -spirv -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// GetRemainingRecursionLevels() called
+
+RWBuffer<uint> buf0;
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NumThreads(8,1,1)]
+[NodeDispatchGrid(32,2,2)]
+[NodeMaxRecursionDepth(16)]  // expected as the MaxNodeRecursionAMDX operand (constant 16)
+void node133_getremainingrecursionlevels()
+{
+  uint remaining = GetRemainingRecursionLevels();
+  // Store the value to the RWBuffer so the builtin read is not removed by DCE
+  buf0[0] = remaining;
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[^ ]*]] "node133_getremainingrecursionlevels" [[RRL:%[^ ]*]]
+// CHECK: OpExecutionModeId [[SHADER]] MaxNodeRecursionAMDX [[U16:%[^ ]*]]
+// CHECK: OpDecorate [[RRL]] BuiltIn RemainingRecursionLevelsAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U16]] = OpConstant [[UINT]] 16
+// CHECK: [[PTR:%[^ ]*]] = OpTypePointer Input [[UINT]]
+// CHECK: [[RRL]] = OpVariable [[PTR]] Input
+// CHECK: OpLoad [[UINT]] [[RRL]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl b/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl
new file mode 100644
index 0000000000..cf1638d75c
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Check that a barrier can be used on a groupshared object from a
+// work graph node
+
+groupshared uint Test;
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void firstNode()
+{
+  Test = 1;  // write to the groupshared variable
+  AllMemoryBarrierWithGroupSync();  // barrier under test; only successful codegen is verified
+}
+
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl b/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl
new file mode 100644
index 0000000000..81fc0e39a2
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Check that group shared memory is allowed from a work graph node
+
+struct Record
+{
+    uint index;  // used by firstNode as the index into testLds
+};
+
+groupshared uint testLds[512];
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(2, 1, 1)]
+[NumThreads(1,1,1)]
+void firstNode(DispatchNodeInputRecord<Record> inputData)
+{
+    testLds[inputData.Get().index] = 99;  // groupshared store indexed by a field of the input record
+}
+
+// CHECK: OpReturn
+
diff --git a/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl
new file mode 100644
index 0000000000..d6a2ea759e
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl
@@ -0,0 +1,22 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Node with EmptyNodeOutput calls GroupIncrementOutputCount
+
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[NumThreads(1024,1,1)]
+[NodeIsProgramEntry]
+void node028_incrementoutputcount([MaxRecords(32)] EmptyNodeOutput empty)
+{
+  empty.GroupIncrementOutputCount(1);  // expected to emit OpAllocateNodePayloadsAMDX with constants 2, 1, 0
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[STRUCT:%[^ ]*]] = OpTypeStruct
+// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]]
+// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]]
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK: OpAllocateNodePayloadsAMDX [[PTR]] [[U2]] [[U1]] [[U0]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl
new file mode 100644
index 0000000000..6cd984fe69
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl
@@ -0,0 +1,22 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Node with EmptyNodeOutput calls ThreadIncrementOutputCount
+
+
+[Shader("node")]
+[NodeLaunch("thread")]
+[NodeIsProgramEntry]
+void node028_incrementoutputcount([MaxRecords(32)] EmptyNodeOutput empty)
+{
+  empty.ThreadIncrementOutputCount(1);  // expected to emit OpAllocateNodePayloadsAMDX with constants 4, 1, 0
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[STRUCT:%[^ ]*]] = OpTypeStruct
+// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]]
+// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]]
+// CHECK-DAG: OpConstantStringAMDX "empty"
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK: OpAllocateNodePayloadsAMDX [[PTR]] [[U4]] [[U1]] [[U0]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl
new file mode 100644
index 0000000000..bae3f759b8
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl
@@ -0,0 +1,26 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Check that SV_DispatchGrid supports array
+
+struct RECORD
+{
+  uint a[3] : SV_DispatchGrid;  // member 0; expected to carry PayloadDispatchIndirectAMDX
+  uint b[3];
+};
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[numthreads(4,4,4)]
+void node01(RWGroupNodeInputRecords<RECORD> input)
+{
+  input.Get().a = input.Get().b;  // copy array b into the SV_DispatchGrid array a
+}
+
+// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD"
+// CHECK: OpMemberDecorate [[RECORD]] 0 PayloadDispatchIndirectAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U3:%[^ ]*]] = OpConstant [[UINT]] 3
+// CHECK: [[ARRAY:%[^ ]*]] = OpTypeArray [[UINT]] [[U3]]
+// CHECK: [[RECORD]] = OpTypeStruct [[ARRAY]] [[ARRAY]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl
new file mode 100644
index 0000000000..aee7e0d014
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl
@@ -0,0 +1,32 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Check that SV_DispatchGrid in nested struct is recognized
+
+struct INNER {
+  uint c;
+  uint3 grid : SV_DispatchGrid;  // member 1 of INNER; expected to carry PayloadDispatchIndirectAMDX
+};
+
+struct RECORD
+{
+  uint a;
+  INNER b;  // the dispatch grid is reached through this nested struct
+};
+
+[Shader("node")]
+[NodeLaunch("coalescing")]
+[numthreads(4,4,4)]
+void node01(RWGroupNodeInputRecords<RECORD> input)
+{
+  input.Get().a = input.Get().b.grid.x;  // reads the SV_DispatchGrid field nested in INNER
+}
+
+// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD"
+// CHECK: OpName [[INNER:%[^ ]*]] "INNER"
+// CHECK: OpMemberDecorate [[INNER]] 1 PayloadDispatchIndirectAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[VECTOR:%[^ ]*]] = OpTypeVector [[UINT]] 3
+// CHECK: [[INNER]] = OpTypeStruct [[UINT]] [[VECTOR]]
+// CHECK: [[RECORD]] = OpTypeStruct [[UINT]] [[INNER]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl b/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl
new file mode 100644
index 0000000000..e2440a31c0
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl
@@ -0,0 +1,30 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Broadcasting launch node with dispatch grid defined in input
+// and max dispatch grid defined in the shader
+
+struct INPUT_GRID
+{
+  uint3 DispatchGrid : SV_DispatchGrid;  // member 0; a member-0 PayloadDispatchIndirectAMDX decoration is expected
+  uint textureIndex;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeMaxDispatchGrid(2,3,4)]  // expected as the MaxNumWorkgroupsAMDX operands (constants 2, 3, 4)
+[NumThreads(1024,1,1)]  // expected as LocalSize 1024 1 1
+void node002_dispatchgrid_input_maxdispatchgrid_shader(DispatchNodeInputRecord<INPUT_GRID> input)
+{
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[^ ]*]] "node002_dispatchgrid_input_maxdispatchgrid_shader"
+// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1
+// CHECK-DAG: OpExecutionModeId [[SHADER]] MaxNumWorkgroupsAMDX [[U2:%[^ ]*]] [[U3:%[^ ]*]] [[U4:%[0-9A-Za-z_]*]]
+// CHECK: OpMemberDecorate %{{[^ ]*}} 0 PayloadDispatchIndirectAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U3]] = OpConstant [[UINT]] 3
+// CHECK-DAG: [[U4]] = OpConstant [[UINT]] 4
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl b/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl
new file mode 100644
index 0000000000..7d8449afab
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl
@@ -0,0 +1,45 @@
+// RUN: %dxc -spirv -T lib_6_8  -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Test referencing params with MaxRecordsSharedWith
+
+struct rec0  // record type of the node input parameter
+{
+    int i0;
+    float f0;
+};
+
+struct rec1  // record type of the NodeOutput parameters
+{
+    float f1;
+    int i1;
+};
+
+[Shader("node")]
+[NodeLaunch("thread")]
+void BackwardRef(
+  RWThreadNodeInputRecord<rec0> InputyMcInputFace,
+  [MaxRecords(5)] NodeOutput<rec1> Output1,  // expected as NodeMaxPayloadsAMDX with constant 5
+  [MaxRecordsSharedWith(Output1)] NodeOutput<rec1> Output2)  // refers back to the earlier parameter Output1
+{
+}
+
+// CHECK: OpDecorateId [[TYPE1:%[^ ]*]] PayloadNodeNameAMDX [[STR1:%[^ ]*]]
+// CHECK: OpDecorateId [[TYPE1]] NodeMaxPayloadsAMDX [[U5:%[^ ]*]]
+// CHECK: OpDecorateId [[TYPE2:%[^ ]*]] PayloadNodeNameAMDX [[STR2:%[^ ]*]]
+// CHECK: OpDecorateId [[TYPE2]] NodeSharesPayloadLimitsWithAMDX [[TYPE1]]
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U5]] = OpConstant [[UINT]] 5
+// CHECK-DAG: [[STR1]] = OpConstantStringAMDX "Output1"
+// CHECK-DAG: [[STR2]] = OpConstantStringAMDX "Output2"
+
+#if 0
+// copied from the DXIL test but doesn't seem to conform to the spec; kept disabled
+[Shader("node")]
+[NodeLaunch("thread")]
+void ForwardRef(
+  RWThreadNodeInputRecord<rec0> InputyMcInputFace,
+  [MaxRecordsSharedWith(Output2)] NodeOutput<rec1> Output1,  // refers to Output2, declared after this parameter
+  [MaxRecords(5)] NodeOutput<rec1> Output2)
+{
+}
+#endif
diff --git a/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl
new file mode 100644
index 0000000000..ac2474b29b
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl
@@ -0,0 +1,150 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Read access to members of node input/output records
+
+RWBuffer<uint> buf0;
+
+struct RECORD
+{
+  uint a;  // member index 0
+  uint b;  // member index 1
+  uint c;  // member index 2
+};
+
+// CHECK: OpName [[BUF0:%[^ ]*]] "buf0"
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK: [[U16:%[^ ]*]] = OpConstant [[UINT]] 16
+// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1
+// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[S2:%[^ ]*]] = OpConstant [[INT]] 2
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7
+// CHECK-DAG: [[TBI:%[^ ]*]] = OpTypeImage [[UINT]] Buffer
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(16,1,1)]
+void node01(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().a;  // read member 0 of the read-only dispatch input record
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(16,1,1)]
+void node02(RWDispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().b;  // read member 1 of the read-write dispatch input record
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S1]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024, 1, 1)]
+[NodeLaunch("coalescing")]
+void node03([MaxRecords(3)] GroupNodeInputRecords<RECORD> input)
+{
+  buf0[0] = input[1].c;  // record 1, member 2: a single access chain with both indices is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U1]] [[S2]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1,1,1)]
+[NodeLaunch("coalescing")]
+void node04([MaxRecords(4)] RWGroupNodeInputRecords<RECORD> input)
+{
+  buf0[0] = input[2].c;  // record 2, member 2 of the read-write group input records
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U2]] [[S2]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1,1,1)]
+[NodeLaunch("coalescing")]
+void node05(NodeOutput<RECORD> output)
+{
+  ThreadNodeOutputRecords<RECORD> outrec = output.GetThreadNodeOutputRecords(1);  // allocation expected with constants 4, 1, 0
+  buf0[0] = outrec.Get().a;  // member 0 of the allocated record
+}
+
+// CHECK: OpFunction
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]]
+// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]]
+// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]]
+// CHECK: [[PTR2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR1]] [[S0]]
+// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR2]]
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1,1,1)]
+[NodeLaunch("coalescing")]
+void node06(NodeOutput<RECORD> output)
+{
+  ThreadNodeOutputRecords<RECORD> outrec = output.GetThreadNodeOutputRecords(7);  // allocation expected with constants 4, 7, 0
+  buf0[0] = outrec[2].b;  // record 2, member 1 of the allocated records
+}
+
+// CHECK: OpFunction
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U7]] [[U0]]
+// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]]
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U2]] [[S1]]
+// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1,1,1)]
+[NodeLaunch("coalescing")]
+void node07(NodeOutput<RECORD> output)
+{
+  GroupNodeOutputRecords<RECORD> outrec = output.GetGroupNodeOutputRecords(1);  // allocation expected with constants 2, 1, 0
+  buf0[0] = outrec.Get().c;  // member 2 of the allocated record
+}
+
+// CHECK: OpFunction
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U1]] [[U0]]
+// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]]
+// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]]
+// CHECK: [[PTR2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR1]] [[S2]]
+// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR2]]
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]]
+// CHECK: OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl
new file mode 100644
index 0000000000..5f7d434bd2
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl
@@ -0,0 +1,193 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Read access of members of input/output record with different type
+// sizes - we check the function specializations generated
+
+RWBuffer<uint> buf0;
+
+struct RECORD
+{
+  half h;  // 16-bit float; the RUN line passes -enable-16bit-types
+  float f;
+  double d;
+  bool b;  // expected as a 32-bit uint member in the SPIR-V struct
+  uint16_t i16;
+  int i;
+  int64_t i64;
+  uint64_t u64;
+};
+
+// CHECK: OpName [[BUF0:%[^ ]*]] "buf0"
+// CHECK-DAG: OpName [[RECORD:%[^ ]*]] "RECORD"
+// CHECK-DAG: OpMemberName [[RECORD]] 0 "h"
+// CHECK-DAG: OpMemberName [[RECORD]] 1 "f"
+// CHECK-DAG: OpMemberName [[RECORD]] 2 "d"
+// CHECK-DAG: OpMemberName [[RECORD]] 3 "b"
+// CHECK-DAG: OpMemberName [[RECORD]] 4 "i16"
+// CHECK-DAG: OpMemberName [[RECORD]] 5 "i"
+// CHECK-DAG: OpMemberName [[RECORD]] 6 "i64"
+// CHECK-DAG: OpMemberName [[RECORD]] 7 "u64"
+
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1
+// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0
+// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1
+// CHECK-DAG: [[S2:%[^ ]*]] = OpConstant [[INT]] 2
+// CHECK-DAG: [[S3:%[^ ]*]] = OpConstant [[INT]] 3
+// CHECK-DAG: [[S4:%[^ ]*]] = OpConstant [[INT]] 4
+// CHECK-DAG: [[S5:%[^ ]*]] = OpConstant [[INT]] 5
+// CHECK-DAG: [[S6:%[^ ]*]] = OpConstant [[INT]] 6
+// CHECK-DAG: [[S7:%[^ ]*]] = OpConstant [[INT]] 7
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[TBI:%[^ ]*]] = OpTypeImage [[UINT]] Buffer
+
+// CHECK-DAG: [[HALF:%[^ ]*]] = OpTypeFloat 16
+// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32
+// CHECK-DAG: [[DOUBLE:%[^ ]*]] = OpTypeFloat 64
+// CHECK-DAG: [[USHORT:%[^ ]*]] = OpTypeInt 16 0
+// CHECK-DAG: [[LONG:%[^ ]*]] = OpTypeInt 64 1
+// CHECK-DAG: [[ULONG:%[^ ]*]] = OpTypeInt 64 0
+// CHECK: [[RECORD]] = OpTypeStruct [[HALF]] [[FLOAT]] [[DOUBLE]] [[UINT]] [[USHORT]] [[INT]] [[LONG]] [[ULONG]]
+// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node01(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().h;  // half -> uint; OpConvertFToU is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[HALF]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node02(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().f;  // float -> uint; OpConvertFToU is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S1]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[FLOAT]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node03(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().d;  // double -> uint; OpConvertFToU is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S2]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[DOUBLE]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node04(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().b;  // bool member loaded as uint; OpINotEqual then OpSelect (1 or 0) is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S3]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[UINT]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpINotEqual [[BOOL]] [[VAL0]] [[U0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpSelect [[UINT]] [[VAL1]] [[U1]] [[U0]]
+// CHECK: [[VAL3:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL3]] [[U0]] [[VAL2]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node05(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().i16;  // uint16_t -> uint; OpUConvert is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S4]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[USHORT]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpUConvert [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node06(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().i;  // int -> uint; OpBitcast is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S5]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[INT]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpBitcast [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node07(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().i64;  // int64_t -> uint; OpSConvert to int followed by OpBitcast is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S6]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[LONG]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpSConvert [[INT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpBitcast [[UINT]] [[VAL1]]
+// CHECK: [[VAL3:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL3]] [[U0]] [[VAL2]] None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node08(DispatchNodeInputRecord<RECORD> input)
+{
+  buf0[0] = input.Get().u64;  // uint64_t -> uint; OpUConvert is expected
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S7]]
+// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[ULONG]] [[PTR]]
+// CHECK: [[VAL1:%[^ ]*]] = OpUConvert [[UINT]] [[VAL0]]
+// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]]
+// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None
+// CHECK: OpFunctionEnd
+
diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl
new file mode 100644
index 0000000000..33fc2dd9ff
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl
@@ -0,0 +1,88 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+
+// Writes to members of the various read-write node records
+
+struct RECORD
+{
+  uint a;  // member index 0
+  uint b;  // member index 1
+};
+
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5
+// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7
+// CHECK-DAG: [[U8:%[^ ]*]] = OpConstant [[UINT]] 8
+// CHECK-DAG: [[U9:%[^ ]*]] = OpConstant [[UINT]] 9
+// CHECK-DAG: [[U11:%[^ ]*]] = OpConstant [[UINT]] 11
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node01(RWDispatchNodeInputRecord<RECORD> input1)
+{
+  input1.Get().a = 5;  // store constant 5 to member 0
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]]
+// CHECK: OpStore [[PTR]] [[U5]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(2,1,1)]
+[NodeLaunch("coalescing")]
+void node02([MaxRecords(4)] RWGroupNodeInputRecords<RECORD> input2)
+{
+  input2[1].b = 7;  // store constant 7 to record 1, member 1
+}
+
+// CHECK: OpFunction
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U1]] [[S1]]
+// CHECK: OpStore [[PTR]] [[U7]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(3,1,1)]
+[NodeLaunch("coalescing")]
+void node03(NodeOutput<RECORD> output)
+{
+  ThreadNodeOutputRecords<RECORD> output3 = output.GetThreadNodeOutputRecords(2);  // allocation expected with constants 4, 2, 0
+  output3.Get().b = 9;  // store constant 9 to member 1 of the allocated record
+}
+
+// CHECK: OpFunction
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U2]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+// CHECK: OpStore [[OUT:%[^ ]*]] [[VAL]]
+// CHECK: [[PTR0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]]
+// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR0]] [[S1]]
+// CHECK: OpStore [[PTR1]] [[U9]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NumThreads(4,1,1)]
+[NodeLaunch("coalescing")]
+void node04(NodeOutput<RECORD> output)
+{
+  GroupNodeOutputRecords<RECORD> output4 = output.GetGroupNodeOutputRecords(8);  // allocation expected with constants 2, 8, 0
+  output4[0].a = 11;  // store constant 11 to record 0, member 0
+}
+
+// CHECK: OpFunction
+// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U8]] [[U0]]
+// CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+// CHECK: OpStore [[OUT:%[^ ]*]] [[VAL]]
+// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]] [[S0]]
+// CHECK: OpStore [[PTR]] [[U11]]
+// CHECK: OpFunctionEnd
+
diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl
new file mode 100644
index 0000000000..d875f27d4e
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl
@@ -0,0 +1,123 @@
+// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// Note: validation disabled until NodePayloadAMDX pointers are allowed
+// as function arguments
+// ==================================================================
+// Test writing to matrix members of node records
+// ==================================================================
+
+// CHECK: OpName [[NODE01:%[^ ]*]] "node01"
+// CHECK: OpName [[INPUT1:%[^ ]*]] "input1"
+// CHECK: OpName [[NODE02:%[^ ]*]] "node02"
+// CHECK: OpName [[INPUT2:%[^ ]*]] "input2"
+// CHECK: OpName [[NODE03:%[^ ]*]] "node03"
+// CHECK: OpName [[OUTPUT3:%[^ ]*]] "output3"
+// CHECK: OpName [[NODE04:%[^ ]*]] "node04"
+// CHECK: OpName [[OUTPUTS4:%[^ ]*]] "outputs4"
+
+struct RECORD
+{
+  row_major float2x2 m0;  // member index 0
+  row_major float2x2 m1;  // member index 1
+  column_major float2x2 m2;  // member index 2
+};
+
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U64:%[^ ]*]] = OpConstant [[UINT]] 64
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32
+// CHECK-DAG: [[F111:%[^ ]*]] = OpConstant [[FLOAT]] 111
+// CHECK-DAG: [[V2FLOAT:%[^ ]*]] = OpTypeVector [[FLOAT]] 2
+// CHECK-DAG: [[C1:%[^ ]*]] = OpConstantComposite [[V2FLOAT]] [[F111]] [[F111]]
+// CHECK-DAG: [[MAT2V2FLOAT:%[^ ]*]] = OpTypeMatrix [[V2FLOAT]] 2
+// CHECK-DAG: [[M1:%[^ ]*]] = OpConstantComposite [[MAT2V2FLOAT]] [[C1]] [[C1]]
+// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1
+// CHECK-DAG: [[I1:%[^ ]*]] = OpConstant [[INT]] 1
+// CHECK-DAG: [[I0:%[^ ]*]] = OpConstant [[INT]] 0
+// CHECK-DAG: [[I2:%[^ ]*]] = OpConstant [[INT]] 2
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[F222:%[^ ]*]] = OpConstant [[FLOAT]] 222
+// CHECK-DAG: [[C2:%[^ ]*]] = OpConstantComposite [[V2FLOAT]] [[F222]] [[F222]]
+// CHECK-DAG: [[M2:%[^ ]*]] = OpConstantComposite [[MAT2V2FLOAT]] [[C2]] [[C2]]
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(64,1,1)]
+void node01(RWDispatchNodeInputRecord<RECORD> input1)
+{
+  // CHECK: [[NODE01]] = OpFunction
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]]
+  // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]]
+  // CHECK: OpStore [[P1]] [[M1]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]]
+  // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I0]]
+  // CHECK: [[VAL:%[^ ]*]] = OpLoad [[MAT2V2FLOAT]] [[P2]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]]
+  // CHECK: [[P3:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]]
+  // CHECK: OpStore [[P3]] [[VAL]]
+  // CHECK: OpFunctionEnd
+  input1.Get().m1 = 111;  // scalar assigned to a 2x2 matrix member; an all-111 constant matrix store is expected
+  input1.Get().m2 = input1.Get().m0;  // matrix copy from member 0 to member 2
+}
+
+[Shader("node")]
+[NumThreads(1,1,1)]
+[NodeLaunch("coalescing")]
+void node02([MaxRecords(4)] RWGroupNodeInputRecords<RECORD> input2)
+{
+  // CHECK: [[NODE02]] = OpFunction
+  // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U0]] [[I1]]
+  // CHECK: OpStore [[P1]] [[M1]]
+  // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U1]] [[I0]]
+  // CHECK: [[VAL:%[^ ]*]] = OpLoad [[MAT2V2FLOAT]] [[P2]]
+  // CHECK: [[P3:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U1]] [[I2]]
+  // CHECK: OpStore [[P3]] [[VAL]]
+  // CHECK: OpFunctionEnd
+  input2[0].m1 = 111;  // record 0, member 1: an all-111 constant matrix store is expected
+  input2[1].m2 = input2[1].m0;  // record 1: matrix copy from member 0 to member 2
+}
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeDispatchGrid(64,1,1)]
+[NodeLaunch("broadcasting")]
+void node03(NodeOutput<RECORD> output3)
+{
+  // CHECK: [[NODE03]] = OpFunction
+  // CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]]
+  // CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+  // CHECK: OpStore [[OUTREC3:%[^ ]*]] [[VAL]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC3]] [[U0]]
+  // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]]
+  // CHECK: OpStore [[P1]] [[M1]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC3]] [[U0]]
+  // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]]
+  // CHECK: OpStore [[P2]] [[M2]]
+  // CHECK: OpFunctionEnd
+  ThreadNodeOutputRecords<RECORD> outrec = output3.GetThreadNodeOutputRecords(1);  // allocation expected with constants 4, 1, 0
+  outrec.Get().m1 = 111;  // all-111 constant matrix store to member 1
+  outrec.Get().m2 = 222;  // all-222 constant matrix store to member 2
+}
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("coalescing")]
+void node04([MaxRecords(5)] NodeOutput<RECORD> outputs4)
+{
+  // CHECK: [[NODE04]] = OpFunction
+  // CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U1]] [[U0]]
+  // CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]]
+  // CHECK: OpStore [[OUTREC4:%[^ ]*]] [[VAL]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC4]] [[U0]]
+  // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]]
+  // CHECK: OpStore [[P1]] [[M1]]
+  // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC4]] [[U0]]
+  // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]]
+  // CHECK: OpStore [[P2]] [[M2]]
+  // CHECK: OpFunctionEnd
+  GroupNodeOutputRecords<RECORD> outrec = outputs4.GetGroupNodeOutputRecords(1);  // allocation expected with constants 2, 1, 0
+  outrec.Get().m1 = 111;  // all-111 constant matrix store to member 1
+  outrec.Get().m2 = 222;  // all-222 constant matrix store to member 2
+}
diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl
new file mode 100644
index 0000000000..ec95c3d758
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl
@@ -0,0 +1,150 @@
+// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s
+
+// Writes to node record members of various types
+
+
+struct RECORD
+{
+  half h;
+  float f;
+  double d;
+  bool b;
+  int16_t i16;
+  uint16_t u16;
+  int i;
+  int64_t i64;
+  uint64_t u64;
+  float3 f3;
+  int ia[7];
+};
+
+// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD"
+// CHECK: OpMemberName [[RECORD]] 0 "h"
+// CHECK: OpMemberName [[RECORD]] 1 "f"
+// CHECK: OpMemberName [[RECORD]] 2 "d"
+// CHECK: OpMemberName [[RECORD]] 3 "b"
+// CHECK: OpMemberName [[RECORD]] 4 "i16"
+// CHECK: OpMemberName [[RECORD]] 5 "u16"
+// CHECK: OpMemberName [[RECORD]] 6 "i"
+// CHECK: OpMemberName [[RECORD]] 7 "i64"
+// CHECK: OpMemberName [[RECORD]] 8 "u64"
+// CHECK: OpMemberName [[RECORD]] 9 "f3"
+// CHECK: OpMemberName [[RECORD]] 10 "ia"
+
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[HALF:%[^ ]*]] = OpTypeFloat 16
+// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1
+// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32
+// CHECK-DAG: [[DOUBLE:%[^ ]*]] = OpTypeFloat 64
+// CHECK-DAG: [[SHORT:%[^ ]*]] = OpTypeInt 16 1
+// CHECK-DAG: [[USHORT:%[^ ]*]] = OpTypeInt 16 0
+// CHECK-DAG: [[LONG:%[^ ]*]] = OpTypeInt 64 1
+// CHECK-DAG: [[ULONG:%[^ ]*]] = OpTypeInt 64 0
+// CHECK-DAG: [[V3FLOAT:%[^ ]*]] = OpTypeVector [[FLOAT]] 3
+
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[HALF_0X1_8P_1:%[^ ]*]] = OpConstant [[HALF]] 0x1.8p+1
+// CHECK-DAG: [[I0:%[^ ]*]] = OpConstant [[INT]] 0
+// CHECK-DAG: [[FN5:%[^ ]*]] = OpConstant [[FLOAT]] -5
+// CHECK-DAG: [[I1:%[^ ]*]] = OpConstant [[INT]] 1
+// CHECK-DAG: [[D7:%[^ ]*]] = OpConstant [[DOUBLE]] 7
+// CHECK-DAG: [[I2:%[^ ]*]] = OpConstant [[INT]] 2
+// CHECK-DAG: [[I3:%[^ ]*]] = OpConstant [[INT]] 3
+// CHECK-DAG: [[S11:%[^ ]*]] = OpConstant [[SHORT]] 11
+// CHECK-DAG: [[I4:%[^ ]*]] = OpConstant [[INT]] 4
+// CHECK-DAG: [[US13:%[^ ]*]] = OpConstant [[USHORT]] 13
+// CHECK-DAG: [[I5:%[^ ]*]] = OpConstant [[INT]] 5
+// CHECK-DAG: [[I17:%[^ ]*]] = OpConstant [[INT]] 17
+// CHECK-DAG: [[I6:%[^ ]*]] = OpConstant [[INT]] 6
+// CHECK-DAG: [[LN19:%[^ ]*]] = OpConstant [[LONG]] -19
+// CHECK-DAG: [[I7:%[^ ]*]] = OpConstant [[INT]] 7
+// CHECK-DAG: [[UL21:%[^ ]*]] = OpConstant [[ULONG]] 21
+// CHECK-DAG: [[I8:%[^ ]*]] = OpConstant [[INT]] 8
+// CHECK-DAG: [[F23:%[^ ]*]] = OpConstant [[FLOAT]] 23
+// CHECK-DAG: [[I9:%[^ ]*]] = OpConstant [[INT]] 9
+// CHECK-DAG: [[I29:%[^ ]*]] = OpConstant [[INT]] 29
+// CHECK-DAG: [[I10:%[^ ]*]] = OpConstant [[INT]] 10
+// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7
+
+// CHECK-DAG: [[AI7:%[^ ]*]] = OpTypeArray [[INT]] [[U7]]
+// CHECK-DAG: [[RECORD]] = OpTypeStruct [[HALF]] [[FLOAT]] [[DOUBLE]] [[UINT]] [[SHORT]] [[USHORT]] [[INT]] [[LONG]] [[ULONG]] [[V3FLOAT]] [[AI7]]
+// CHECK-DAG: [[RAR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[RECORD]]
+// CHECK-DAG: [[RARP:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[RAR]]
+// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[HALFP:%[^ ]*]] = OpTypePointer Function [[HALF]]
+// CHECK-DAG: [[FLOATP:%[^ ]*]] = OpTypePointer Function [[FLOAT]]
+// CHECK-DAG: [[DOUBLEP:%[^ ]*]] = OpTypePointer Function [[DOUBLE]]
+// CHECK-DAG: [[UINTP:%[^ ]*]] = OpTypePointer Function [[UINT]]
+// CHECK-DAG: [[SHORTP:%[^ ]*]] = OpTypePointer Function [[SHORT]]
+// CHECK-DAG: [[USHORTP:%[^ ]*]] = OpTypePointer Function [[USHORT]]
+// CHECK-DAG: [[INTP:%[^ ]*]] = OpTypePointer Function [[INT]]
+// CHECK-DAG: [[LONGP:%[^ ]*]] = OpTypePointer Function [[LONG]]
+// CHECK-DAG: [[ULONGP:%[^ ]*]] = OpTypePointer Function [[ULONG]]
+
+[Shader("node")]
+[NumThreads(1024,1,1)]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(512,1,1)]
+void node125(NodeOutput<RECORD> output)
+{
+  GroupNodeOutputRecords<RECORD> output01 = output.GetGroupNodeOutputRecords(1);
+  // CHECK: OpAllocateNodePayloadsAMDX [[RARP]] [[U2]] [[U1]] [[U0]]
+
+  output01.Get().h = 3.0;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[HALFP]]
+  // CHECK-SAME: [[I0]]
+  // CHECK: OpStore [[PTR]] [[HALF_0X1_8P_1]]
+
+  output01.Get().f = -5.0;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[FLOATP]]
+  // CHECK-SAME: [[I1]]
+  // CHECK: OpStore [[PTR]] [[FN5]]
+
+  output01.Get().d = 7.0;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[DOUBLEP]]
+  // CHECK-SAME: [[I2]]
+  // CHECK: OpStore [[PTR]] [[D7]]
+
+  output01.Get().b = true;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[UINTP]]
+  // CHECK-SAME: [[I3]]
+  // CHECK: OpStore [[PTR]] [[U1]]
+
+  output01.Get().i16 = 11;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[SHORTP]]
+  // CHECK-SAME: [[I4]]
+  // CHECK: OpStore [[PTR]] [[S11]]
+
+  output01.Get().u16 = 13;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[USHORTP]]
+  // CHECK-SAME: [[I5]]
+  // CHECK: OpStore [[PTR]] [[US13]]
+
+  output01.Get().i = 17;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[INTP]]
+  // CHECK-SAME: [[I6]]
+  // CHECK: OpStore [[PTR]] [[I17]]
+
+  output01.Get().i64 = -19;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[LONGP]]
+  // CHECK-SAME: [[I7]]
+  // CHECK: OpStore [[PTR]] [[LN19]]
+
+  output01.Get().u64 = 21;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[ULONGP]]
+  // CHECK-SAME: [[I8]]
+  // CHECK: OpStore [[PTR]] [[UL21]]
+
+  output01.Get().f3.y = 23;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[FLOATP]]
+  // CHECK-SAME: [[I9]]
+  // CHECK-SAME: [[I1]]
+  // CHECK: OpStore [[PTR]] [[F23]]
+
+  output01.Get().ia[5] = 29;
+  // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[INTP]]
+  // CHECK-SAME: [[I10]]
+  // CHECK-SAME: [[I5]]
+  // CHECK: OpStore [[PTR]] [[I29]]
+}
diff --git a/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl b/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl
new file mode 100644
index 0000000000..4d1726abb2
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl
@@ -0,0 +1,88 @@
+// RUN: %dxc -spirv -T lib_6_9 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+// XFAIL: *
+// disabled until mesh nodes are implemented
+
+// Test loading of node input and funneling into mesh outputs
+// Essentially an end-to-end mesh node test.
+
+
+RWBuffer<float> buf0;
+
+#define MAX_VERT 32
+#define MAX_PRIM 16
+
+struct MeshPerVertex {
+    float4 position : SV_Position;
+    float color[4] : COLOR;
+};
+
+struct MeshPerPrimitive {
+    float normal : NORMAL;
+    float malnor : MALNOR;
+    float alnorm : ALNORM;
+    float ormaln : ORMALN;
+    int layer[6] : LAYER;
+};
+
+struct MeshPayload {
+    float normal;
+    float malnor;
+    float alnorm;
+    float ormaln;
+    int layer[6];
+};
+
+groupshared float gsMem[MAX_PRIM];
+
+[Shader("node")]
+[NodeLaunch("mesh")]
+[outputtopology("triangle")]
+[numthreads(128, 1, 1)]
+[NodeDispatchGrid(64,1,1)]
+void node_setmeshoutputcounts(DispatchNodeInputRecord<MeshPayload> mpl,
+            out indices uint3 primIndices[MAX_PRIM],
+            out vertices MeshPerVertex verts[MAX_VERT],
+            out primitives MeshPerPrimitive prims[MAX_PRIM],
+            in uint tig : SV_GroupIndex) {
+  SetMeshOutputCounts(32, 16); // same values as MAX_VERT and MAX_PRIM
+
+  // Build the per-vertex value `ov` that is stored to verts[tig] at the end.
+
+  MeshPerVertex ov;
+  ov.position = float4(14.0,15.0,16.0,17.0);
+  ov.color[0] = 14.0;
+  ov.color[1] = 15.0;
+  ov.color[2] = 16.0;
+  ov.color[3] = 17.0;
+
+  if (tig % 3) { // taken only when tig is not a multiple of 3
+    primIndices[tig / 3] = uint3(tig, tig + 1, tig + 2);
+
+    MeshPerPrimitive op;
+    op.normal = mpl.Get().normal;
+    op.malnor = gsMem[tig / 3 + 1]; // read from groupshared memory, not from the input record
+    op.alnorm = mpl.Get().alnorm;
+    op.ormaln = mpl.Get().ormaln;
+    op.layer[0] = mpl.Get().layer[0];
+    op.layer[1] = mpl.Get().layer[1];
+    op.layer[2] = mpl.Get().layer[2];
+    op.layer[3] = mpl.Get().layer[3];
+    op.layer[4] = mpl.Get().layer[4];
+    op.layer[5] = mpl.Get().layer[5];
+
+    gsMem[tig / 3] = op.normal;
+    prims[tig / 3] = op;
+  }
+  verts[tig] = ov; // NOTE(review): numthreads is 128 but verts has MAX_VERT (32) entries - confirm intended
+}
+
+// CHECK: OpEntryPoint MeshExt [[ENTRY:%[^ ]*]]
+// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputVertices 32
+// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputPrimitivesNV 16
+// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputTrianglesNV
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U16:%[^ ]*]] = OpConstant [[UINT]] 16
+// CHECK-DAG: [[U32:%[^ ]*]] = OpConstant [[UINT]] 32
+// CHECK: [[ENTRY]] = OpFunction
+// CHECK: OpSetMeshOutputsEXT [[U32]] [[U16]]
+// CHECK: OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl b/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl
new file mode 100644
index 0000000000..17db15e7db
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl
@@ -0,0 +1,33 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// OutputComplete() is called with NodeOutput
+
+struct OUTPUT_RECORD
+{
+  uint value;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(256,1,1)]
+[NumThreads(1024,1,1)]
+void outputcomplete([MaxRecords(256)] NodeOutput<OUTPUT_RECORD> output)
+{
+  ThreadNodeOutputRecords<OUTPUT_RECORD> outputrecords = output.GetThreadNodeOutputRecords(1);
+  // The record is left unwritten; this test only exercises OutputComplete().
+  outputrecords.OutputComplete();
+}
+
+// CHECK: OpName [[RECORDS:%[^ ]*]] "outputrecords"
+// CHECK: OpDecorateId [[ARR:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]]
+// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[REC:%[^ ]*]] = OpTypeStruct [[UINT]]
+// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[REC]]
+// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]]
+// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4
+// CHECK: [[V0:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[PTR]] [[U4]] [[U1]] [[U0]]
+// CHECK: [[V1:%[^ ]*]] = OpLoad [[ARR]] [[V0]]
+// CHECK: OpStore [[RECORDS]] [[V1]]
+// CHECK: OpEnqueueNodePayloadsAMDX [[RECORDS]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl
new file mode 100644
index 0000000000..08a103cf5e
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// NodeOutputIsValid() is called with EmptyNodeOutput
+
+RWBuffer<uint> buf0;
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(256,1,1)]
+[NumThreads(1,1,1)]
+void node131_nodeoutputisvalid_emptynodeoutput(EmptyNodeOutput output)
+{
+  buf0[0] = output.IsValid();
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool
+// CHECK: OpIsNodePayloadValidAMDX [[BOOL]] %{{[^ ]*}} [[U0]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl
new file mode 100644
index 0000000000..40e3a74fcb
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// IsValid() is invoked on NodeOutput
+
+RWBuffer<uint> buf0;
+
+struct RECORD
+{
+  uint value;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(256,1,1)]
+[NumThreads(1,1,1)]
+void node129_nodeoutputisvalid_nodeoutput(NodeOutput<RECORD> output)
+{
+  buf0[0] = output.IsValid();
+}
+
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool
+// CHECK: OpIsNodePayloadValidAMDX [[BOOL]] %{{[^ ]*}} [[U0]]
diff --git a/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
new file mode 100644
index 0000000000..953288929d
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Node renamed via NodeID with no index (defaults to 0); output NodeID uses index 2
+
+struct RECORD {
+  uint i;
+};
+
+[Shader("node")]
+[NodeLaunch("thread")]
+[NodeID("new_node_name")]
+[NodeIsProgramEntry]
+void node017_renamed_node([NodeID("output_node_name", 2)] NodeOutput<RECORD> r)
+{
+  r.GetThreadNodeOutputRecords(1);
+}
+
+// CHECK: OpEntryPoint GLCompute %{{[^ ]*}} "node017_renamed_node"
+// CHECK-DAG: OpDecorateId [[TYPE:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]]
+// CHECK-DAG: OpDecorateId [[TYPE]] PayloadNodeBaseIndexAMDX [[U2:%[0-9A-Za-z_]*]]
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[STR]] = OpConstantStringAMDX "output_node_name"
+// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2
diff --git a/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl
new file mode 100644
index 0000000000..c439bef017
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl
@@ -0,0 +1,42 @@
+// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Check that the NodeShareInputOf metadata entry is populated correctly
+
+struct entryRecord
+{
+    int data0;
+};
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(2, 1, 1)]
+[NumThreads(1, 1, 1)]
+void firstNode(DispatchNodeInputRecord<entryRecord> inputData)
+{ }
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(2, 1, 1)]
+[NumThreads(1, 1, 1)]
+[NodeShareInputOf("firstNode")]
+void secondNode(DispatchNodeInputRecord<entryRecord> inputData)
+{ }
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(2, 1, 1)]
+[NumThreads(1, 1, 1)]
+[NodeShareInputOf("firstNode", 3)]
+void thirdNode(DispatchNodeInputRecord<entryRecord> inputData)
+{ }
+
+
+// CHECK: OpEntryPoint GLCompute %firstNode "firstNode"
+// CHECK: OpEntryPoint GLCompute %secondNode "secondNode"
+// CHECK: OpEntryPoint GLCompute %thirdNode "thirdNode"
+// CHECK-NOT: OpExecutionModeId %firstNode SharesInputWithAMDX
+// CHECK: OpExecutionModeId %secondNode SharesInputWithAMDX [[STR:%[^ ]*]] [[U0:%[^ ]*]]
+// CHECK: OpExecutionModeId %thirdNode SharesInputWithAMDX [[STR]] [[U3:%[^ ]*]]
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U3:%[^ ]*]] = OpConstant [[UINT]] 3
diff --git a/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl b/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl
new file mode 100644
index 0000000000..ca3c14b8da
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl
@@ -0,0 +1,141 @@
+// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+struct RECORD1
+{
+  uint value;
+  uint value2;
+};
+
+// CHECK: OpEntryPoint GLCompute [[NODE10:%[^ ]*]] "node_1_0"
+// CHECK: OpEntryPoint GLCompute [[NODE11:%[^ ]*]] "node_1_1"
+// CHECK: OpEntryPoint GLCompute [[NODE12:%[^ ]*]] "node_1_2"
+// CHECK: OpEntryPoint GLCompute [[NODE20:%[^ ]*]] "node_2_0"
+// CHECK: OpEntryPoint GLCompute [[NODE21:%[^ ]*]] "node_2_1"
+// CHECK: OpEntryPoint GLCompute [[NODE22:%[^ ]*]] "node_2_2"
+// CHECK: OpDecorateId [[A10:%[^ ]*]] PayloadNodeNameAMDX [[S10:%[^ ]*]]
+// CHECK: OpDecorateId [[A10]] NodeMaxPayloadsAMDX [[U31:%[^ ]*]]
+// CHECK: OpDecorate [[A10]] PayloadNodeSparseArrayAMDX
+// CHECK: OpDecorateId [[A10]] PayloadNodeArraySizeAMDX [[U129:%[^ ]*]]
+// CHECK: OpDecorateId [[A11:%[^ ]*]] PayloadNodeNameAMDX [[S11:%[^ ]*]]
+// CHECK: OpDecorateId [[A11]] NodeMaxPayloadsAMDX [[U37:%[^ ]*]]
+// CHECK: OpDecorate [[A11]] PayloadNodeSparseArrayAMDX
+// CHECK: OpDecorateId [[A12:%[^ ]*]] PayloadNodeNameAMDX [[S12:%[^ ]*]]
+// CHECK: OpDecorateId [[A12]] NodeMaxPayloadsAMDX [[U47:%[^ ]*]]
+// CHECK: OpDecorate [[A12]] PayloadNodeSparseArrayAMDX
+// CHECK: OpDecorateId [[A20:%[^ ]*]] PayloadNodeNameAMDX [[S20:%[^ ]*]]
+// CHECK: OpDecorateId [[A20]] NodeMaxPayloadsAMDX [[U41:%[^ ]*]]
+// CHECK: OpDecorate [[A20]] PayloadNodeSparseArrayAMDX
+// CHECK: OpDecorateId [[A20]] PayloadNodeArraySizeAMDX [[U131:%[^ ]*]]
+// CHECK: OpDecorateId [[A21:%[^ ]*]] PayloadNodeNameAMDX [[S21:%[^ ]*]]
+// CHECK: OpDecorateId [[A21]] NodeMaxPayloadsAMDX [[U43:%[^ ]*]]
+// CHECK: OpDecorate [[A21]] PayloadNodeSparseArrayAMDX
+// CHECK: OpDecorateId [[A22:%[^ ]*]] PayloadNodeNameAMDX [[S22:%[^ ]*]]
+// CHECK: OpDecorateId [[A22]] NodeMaxPayloadsAMDX [[U53:%[^ ]*]]
+// CHECK: OpDecorate [[A22]] PayloadNodeSparseArrayAMDX
+// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
+// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
+// CHECK: [[RECORD:%[^ ]*]] = OpTypeStruct [[UINT]] [[UINT]]
+// CHECK-DAG: [[A10]] = OpTypeNodePayloadArrayAMDX [[RECORD]]
+// CHECK-DAG: [[S10]] = OpConstantStringAMDX "OutputArray_1_0"
+// CHECK-DAG: [[U31]] = OpConstant [[UINT]] 31
+// CHECK-DAG: [[U129]] = OpConstant [[UINT]] 129
+// CHECK-DAG: [[A11]] = OpTypeNodePayloadArrayAMDX [[RECORD]]
+// CHECK-DAG: [[S11]] = OpConstantStringAMDX "OutputArray_1_1"
+// CHECK-DAG: [[U37]] = OpConstant [[UINT]] 37
+// CHECK-DAG: [[A12]] = OpTypeNodePayloadArrayAMDX [[RECORD]]
+// CHECK-DAG: [[S12]] = OpConstantStringAMDX "Output_1_2"
+// CHECK-DAG: [[U47]] = OpConstant [[UINT]] 47
+// CHECK-DAG: [[EMPTY:%[^ ]*]] = OpTypeStruct
+// CHECK-DAG: [[A20]] = OpTypeNodePayloadArrayAMDX [[EMPTY]]
+// CHECK-DAG: [[S20]] = OpConstantStringAMDX "OutputArray_2_0"
+// CHECK-DAG: [[U41]] = OpConstant [[UINT]] 41
+// CHECK-DAG: [[U131]] = OpConstant [[UINT]] 131
+// CHECK-DAG: [[A21]] = OpTypeNodePayloadArrayAMDX [[EMPTY]]
+// CHECK-DAG: [[S21]] = OpConstantStringAMDX "OutputArray_2_1"
+// CHECK-DAG: [[U43]] = OpConstant [[UINT]] 43
+// CHECK-DAG: [[A22]] = OpTypeNodePayloadArrayAMDX [[EMPTY]]
+// CHECK-DAG: [[S22]] = OpConstantStringAMDX "Output_2_2"
+// CHECK-DAG: [[U53]] = OpConstant [[UINT]] 53
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_1_0(
+    [AllowSparseNodes] [NodeArraySize(129)] [MaxRecords(31)]
+    NodeOutputArray<RECORD1> OutputArray_1_0) {
+  ThreadNodeOutputRecords<RECORD1> outRec = OutputArray_1_0[1].GetThreadNodeOutputRecords(2);
+  outRec.OutputComplete();
+}
+
+// CHECK: [[NODE10]] = OpFunction %void None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_1_1(
+    [UnboundedSparseNodes] [MaxRecords(37)]
+    NodeOutputArray<RECORD1> OutputArray_1_1) {
+  ThreadNodeOutputRecords<RECORD1> outRec = OutputArray_1_1[1].GetThreadNodeOutputRecords(2);
+  outRec.OutputComplete();
+}
+
+// CHECK: [[NODE11]] = OpFunction %void None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_1_2(
+    [AllowSparseNodes] [MaxRecords(47)]
+    NodeOutput<RECORD1> Output_1_2) {
+  ThreadNodeOutputRecords<RECORD1> outRec = Output_1_2.GetThreadNodeOutputRecords(2);
+  outRec.OutputComplete();
+}
+
+// CHECK: [[NODE12]] = OpFunction %void None
+// CHECK: %{{[^ ]*}} = OpAllocateNodePayloadsAMDX %{{[^ ]*}} %{{[^ ]*}} %{{[^ ]*}} [[U0]]
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_2_0(
+    [AllowSparseNodes] [NodeArraySize(131)] [MaxRecords(41)]
+    EmptyNodeOutputArray OutputArray_2_0) {
+  OutputArray_2_0[1].GroupIncrementOutputCount(10);
+}
+
+// CHECK: [[NODE20]] = OpFunction %void None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_2_1(
+    [UnboundedSparseNodes] [MaxRecords(43)]
+    EmptyNodeOutputArray OutputArray_2_1) {
+  OutputArray_2_1[1].GroupIncrementOutputCount(10);
+}
+
+// CHECK: [[NODE21]] = OpFunction %void None
+// CHECK: OpFunctionEnd
+
+[Shader("node")]
+[NodeLaunch("broadcasting")]
+[NodeDispatchGrid(1, 1, 1)]
+[NumThreads(1, 1, 1)]
+void node_2_2(
+    [AllowSparseNodes] [MaxRecords(53)]
+    EmptyNodeOutput Output_2_2) {
+  Output_2_2.GroupIncrementOutputCount(10);
+}
+
+// CHECK: [[NODE22]] = OpFunction %void None
+// CHECK: %{{[^ ]*}} = OpAllocateNodePayloadsAMDX %{{[^ ]*}} %{{[^ ]*}} %{{[^ ]*}} [[U0]]
+// CHECK: OpFunctionEnd
diff --git a/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl
new file mode 100644
index 0000000000..8732cf3478
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// NumThreads
+
+[Shader("node")]
+[NodeLaunch("thread")]
+[NumThreads(1,1,1)]
+[NodeIsProgramEntry]
+void node010_thread_numthreads_shader()
+{
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]]
+// CHECK: OpExecutionMode [[SHADER]] LocalSize 1 1 1
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl
new file mode 100644
index 0000000000..0b230479c4
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s
+
+// Thread launch node without NumThreads specified should use a
+// default of (1,1,1)
+
+[Shader("node")]
+[NodeLaunch("thread")]
+[NodeIsProgramEntry]
+void node011_thread_numthreads_none()
+{
+}
+
+// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]]
+// CHECK: OpExecutionMode [[SHADER]] LocalSize 1 1 1
+// CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
index 12b03fffda..4d10dc446b 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
@@ -60,12 +60,6 @@ RWBuffer<int64_t> Buf_r64i;
 [[vk::image_format("r64ui")]]
 RWBuffer<uint64_t> Buf_r64ui;
 
-[[vk::image_format("r16f")]]
-// CHECK: [[ImgType:%[0-9a-zA-Z_]+]] = OpTypeImage %float 2D 2 0 0 2 R16f
-// CHECK: [[ArrayType:%[0-9a-zA-Z_]+]] = OpTypeRuntimeArray [[ImgType]]
-// CHECK: [[PtrType:%[0-9a-zA-Z_]+]] = OpTypePointer UniformConstant [[ArrayType]]
-RWTexture2D<float> Buf_r16f_bindless[];
-
 struct S {
     RWBuffer<float4> b;
 };

From 296d4aee97308cbadbc03851b3457cebf7ced13b Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Tue, 3 Jun 2025 19:55:57 -0700
Subject: [PATCH 56/93] NFC: Add LSAN leak suppression to ASAN pipeline (#7503)

Address-sanitizer reports a leak coming from the dynamic shared library
loading code, though dlopen() is paired with dlclose(). This leak
doesn't manifest on main yet because dxc adds dxcompiler directly to
target_link_libraries (which it shouldn't have to), and dxil is loaded
within that library. A PR to remove dxil loading from dxcompiler exposes
the leak when dxc loads dxil dynamically.

This change adds a suppressions file for LSAN to suppress the leak in
the loader code that happens under call_init. This also changes the
Linux_Clang_Release build to RelWithDebInfo so symbols are present.
---
 azure-pipelines.yml                      | 10 ++++++----
 utils/asan/x86_64-pc-linux-gnu.lsan.supp |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)
 create mode 100644 utils/asan/x86_64-pc-linux-gnu.lsan.supp

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 7967fa03e3..dee579287c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -56,13 +56,13 @@ stages:
 
     strategy:
       matrix:
-        Linux_Clang_Release:
+        Linux_Clang_RelWithDebInfo:
           image: ${{ variables.linux }}
-          configuration: Release
+          configuration: RelWithDebInfo
           CC: clang-18
           CXX: clang++-18
-          CMAKE_OPTS: -DLLVM_ENABLE_WERROR=On -DLLVM_USE_SANITIZER='Address;Undefined' -DLLVM_ENABLE_LIBCXX=On -DLLVM_USE_LINKER=lld
-          CHECK_ALL_ENV: ASAN_OPTIONS=alloc_dealloc_mismatch=0
+          CMAKE_OPTS: -DLLVM_ENABLE_WERROR=On -DLLVM_USE_SANITIZER='Address;Undefined' -DLLVM_ENABLE_LIBCXX=On -DLLVM_USE_LINKER=lld-18
+          CHECK_ALL_ENV: ASAN_OPTIONS=alloc_dealloc_mismatch=0 LSAN_OPTIONS=suppressions=$BUILD_SOURCESDIRECTORY/utils/asan/x86_64-pc-linux-gnu.lsan.supp:print_suppressions=0 ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-18 LSAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-18
           OS: Linux
         Linux_Clang_Debug:
           image: ${{ variables.linux }}
@@ -107,6 +107,8 @@ stages:
         versionSpec: '3.x'
 
     - bash: |
+        sudo apt-get update
+        sudo apt-get upgrade libc6 libc6-dbg
         sudo apt-get install ninja-build
         wget https://apt.llvm.org/llvm.sh
         chmod u+x llvm.sh
diff --git a/utils/asan/x86_64-pc-linux-gnu.lsan.supp b/utils/asan/x86_64-pc-linux-gnu.lsan.supp
new file mode 100644
index 0000000000..3a7725f535
--- /dev/null
+++ b/utils/asan/x86_64-pc-linux-gnu.lsan.supp
@@ -0,0 +1 @@
+leak:^call_init$
\ No newline at end of file

From 77dcbb61f7efdee92a19e4e289f03f2c77365222 Mon Sep 17 00:00:00 2001
From: Chris B <cbieneman@microsoft.com>
Date: Thu, 5 Jun 2025 13:37:04 -0500
Subject: [PATCH 57/93] Require complete types in some missing places (#7511)

This change adds two additional calls to Sema::RequireCompleteType, one
when evaluating unary `sizeof`, which fixes a reported issue with
templates. The second is in the DXR diagnostics where complete type
should be required but not diagnosed because ordering in the compiler is
a bit wonky when diagnosing DXR entry points.

Fixes #7510

---------

Co-authored-by: Tex Riddell <texr@microsoft.com>
---
 tools/clang/lib/AST/HlslTypes.cpp             |  1 +
 tools/clang/lib/Sema/SemaDXR.cpp              | 15 ++++++-
 tools/clang/lib/Sema/SemaExpr.cpp             | 16 ++++++--
 tools/clang/lib/Sema/SemaHLSL.cpp             | 12 +++---
 .../test/SemaHLSL/raytracing-entry-diags.hlsl | 21 ++++++++++
 .../sizeof-requires-complete-type.hlsl        | 27 +++++++++++++
 .../test/SemaHLSL/template-udt-load.hlsl      | 39 +++++++++++++++++++
 7 files changed, 121 insertions(+), 10 deletions(-)
 create mode 100644 tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl

diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index 7693c065be..00c18a81a9 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -108,6 +108,7 @@ bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type) {
 // which can't be annotated. But includes UDTs of trivially copyable data and
 // the builtin trivially copyable raytracing structs.
 bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) {
+  assert(!QT->isIncompleteType() && "Type must be complete!");
   const clang::Type *Ty = QT.getCanonicalType().getTypePtr();
   if (const RecordType *RT = dyn_cast<RecordType>(Ty)) {
     const RecordDecl *RD = RT->getDecl();
diff --git a/tools/clang/lib/Sema/SemaDXR.cpp b/tools/clang/lib/Sema/SemaDXR.cpp
index 04e1582513..0ccb21fb2b 100644
--- a/tools/clang/lib/Sema/SemaDXR.cpp
+++ b/tools/clang/lib/Sema/SemaDXR.cpp
@@ -1190,7 +1190,10 @@ void DiagnoseCallableEntry(Sema &S, FunctionDecl *FD,
           << /*payload|callable*/ 1 << Param;
     QualType Ty = Param->getType().getNonReferenceType();
 
-    if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty)))
+    // Don't diagnose incomplete type here. Function parameters are
+    // checked in Sema::CheckParmsForFunctionDef.
+    if (!S.RequireCompleteType(Param->getLocation(), Ty, 0) &&
+        !(hlsl::IsHLSLCopyableAnnotatableRecord(Ty)))
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
           << /*payload|attributes|callable*/ 2 << /*parameter %2|type*/ 0
           << Param;
@@ -1232,6 +1235,11 @@ void DiagnoseMissOrAnyHitEntry(Sema &S, FunctionDecl *FD,
 
     QualType Ty = Param->getType().getNonReferenceType();
 
+    // Don't diagnose here, just continue if this fails. Function parameters are
+    // checked in Sema::CheckParmsForFunctionDef.
+    if (S.RequireCompleteType(Param->getLocation(), Ty, 0))
+      continue;
+
     if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) {
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
           << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0
@@ -1286,6 +1294,11 @@ void DiagnoseClosestHitEntry(Sema &S, FunctionDecl *FD,
 
     QualType Ty = Param->getType().getNonReferenceType();
 
+    // Don't diagnose here, just continue if this fails. Function parameters are
+    // checked in Sema::CheckParmsForFunctionDef.
+    if (S.RequireCompleteType(Param->getLocation(), Ty, 0))
+      continue;
+
     if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) {
       S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt)
           << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0
diff --git a/tools/clang/lib/Sema/SemaExpr.cpp b/tools/clang/lib/Sema/SemaExpr.cpp
index 507b6a7508..389fcfc3ff 100644
--- a/tools/clang/lib/Sema/SemaExpr.cpp
+++ b/tools/clang/lib/Sema/SemaExpr.cpp
@@ -3798,13 +3798,21 @@ static void warnOnSizeofOnArrayDecay(Sema &S, SourceLocation Loc, QualType T,
 }
 
 // HLSL Change Begins
-bool Sema::CheckHLSLUnaryExprOrTypeTraitOperand(QualType ExprType, SourceLocation Loc,
+bool Sema::CheckHLSLUnaryExprOrTypeTraitOperand(QualType ExprType,
+                                                SourceLocation Loc,
                                                 UnaryExprOrTypeTrait ExprKind) {
   assert(ExprKind == UnaryExprOrTypeTrait::UETT_SizeOf);
 
-  // "sizeof 42" is ill-defined because HLSL has literal int type which can decay to an int of any size.
-  const BuiltinType* BuiltinTy = ExprType->getAs<BuiltinType>();
-  if (BuiltinTy != nullptr && (BuiltinTy->getKind() == BuiltinType::LitInt || BuiltinTy->getKind() == BuiltinType::LitFloat)) {
+  if (RequireCompleteType(Loc, ExprType,
+                          diag::err_sizeof_alignof_incomplete_type, ExprKind,
+                          ExprType))
+    return true;
+
+  // "sizeof 42" is ill-defined because HLSL has literal int type which can
+  // decay to an int of any size.
+  const BuiltinType *BuiltinTy = ExprType->getAs<BuiltinType>();
+  if (BuiltinTy != nullptr && (BuiltinTy->getKind() == BuiltinType::LitInt ||
+                               BuiltinTy->getKind() == BuiltinType::LitFloat)) {
     Diag(Loc, diag::err_hlsl_sizeof_literal) << ExprType;
     return true;
   }
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 744b06b8d0..3d9de1804d 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -10947,11 +10947,13 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
       }
 
       if (IsBABLoad || IsBABStore) {
-        const bool IsLegalTemplate =
-            !functionTemplateTypeArg.isNull() &&
-            hlsl::IsHLSLNumericOrAggregateOfNumericType(
-                functionTemplateTypeArg);
-        if (!IsLegalTemplate) {
+        const bool IsNull = functionTemplateTypeArg.isNull();
+        // Incomplete type is diagnosed elsewhere, so just fail if incomplete.
+        if (!IsNull &&
+            getSema()->RequireCompleteType(Loc, functionTemplateTypeArg, 0))
+          return Sema::TemplateDeductionResult::TDK_Invalid;
+        if (IsNull || !hlsl::IsHLSLNumericOrAggregateOfNumericType(
+                          functionTemplateTypeArg)) {
           getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_numeric)
               << intrinsicName;
           DiagnoseTypeElements(
diff --git a/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl b/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl
index e41c6a2f4f..8dfc927e11 100644
--- a/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl
+++ b/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl
@@ -181,3 +181,24 @@ void callable7(inout MyPayload payload, float F) {}
 
 [shader("callable")]
 float callable8(inout MyPayload payload) {} // expected-error{{return type for 'callable' shaders must be void}}
+
+// expected-note@+1 6 {{forward declaration of 'Incomplete'}}
+struct Incomplete;
+
+// expected-error@+3{{variable has incomplete type 'Incomplete'}}
+// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}}
+[shader("anyhit")]
+void anyhit_incomplete( inout Incomplete A1, Incomplete A2) { }
+
+// expected-error@+3{{variable has incomplete type 'Incomplete'}}
+// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}}
+[shader("closesthit")]
+void closesthit_incomplete( inout Incomplete payload, Incomplete attr ) {}
+
+// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}}
+[shader("miss")]
+void miss_incomplete( inout Incomplete payload) { }
+
+// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}}
+[shader("callable")]
+void callable_incomplete(inout Incomplete payload) {}
diff --git a/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl b/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl
new file mode 100644
index 0000000000..31d4898efe
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -T lib_6_3 -verify %s
+
+struct Complete {};
+
+struct Incomplete; // expected-note{{forward declaration of 'Incomplete'}}
+template<typename T> struct CompleteTemplate {};
+
+void fn() {
+  uint s;
+  // Complete types are easy. They are complete before we get to the expression.
+  s = sizeof(Complete); // This works!
+
+  // A type may be incomplete for several reasons.
+
+  // It may be incomplete because there is only a forward declaration, which
+  // should produce an error since we can't materialize a definition.
+  s = sizeof(Incomplete); // expected-error{{invalid application of 'sizeof' to an incomplete type 'Incomplete'}}
+
+  // It may be incomplete because it is an un-instantiated template, which
+  // should work because we can just instantiate it.
+  s = sizeof(CompleteTemplate<int>); // This works!
+
+  // It may be incomplete because it is a lazy-initialized type from HLSL,
+  // which can be completed, and then will report a non-numeric type error.
+  // expected-error@+1{{invalid application of 'sizeof' to non-numeric type 'Buffer'}}
+  s = sizeof(Buffer);
+}
diff --git a/tools/clang/test/SemaHLSL/template-udt-load.hlsl b/tools/clang/test/SemaHLSL/template-udt-load.hlsl
index f666297bb9..dd7cf8bd16 100644
--- a/tools/clang/test/SemaHLSL/template-udt-load.hlsl
+++ b/tools/clang/test/SemaHLSL/template-udt-load.hlsl
@@ -4,6 +4,34 @@
 ByteAddressBuffer In;
 RWBuffer<float> Out;
 
+template <typename T>
+struct Foo {
+  // expected-note@+1{{'RWBuffer<float>' field declared here}}
+  T Member;
+};
+
+template <typename T>
+struct MyTemplate {
+  T GetValue(ByteAddressBuffer srv, uint offset) {
+    // expected-error@+2{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+    // expected-error@+1{{object 'RWBuffer<float>' is not allowed in builtin template parameters}}
+    return srv.Load<T>(offset);
+  }
+};
+template <typename T>
+T GetValue(uint offset) {
+  MyTemplate<T> myTemplate;
+  // expected-error@+2{{scalar, vector, or matrix expected}}
+  // expected-note@+1{{in instantiation of member function 'MyTemplate<RWBuffer<float> >::GetValue' requested here}}
+  return myTemplate.GetValue(In, offset) +
+  // expected-error@+2{{Explicit template arguments on intrinsic Load must be a single numeric type}}
+  // expected-error@+1{{object 'RWBuffer<float>' is not allowed in builtin template parameters}}
+         In.Load<Foo<T> >(offset + 4).Member;
+}
+
+// expected-note@+1{{forward declaration of 'Incomplete'}}
+struct Incomplete;
+
 [shader("compute")]
 [numthreads(1,1,1)]
 void main()
@@ -11,5 +39,16 @@ void main()
   RWBuffer<float> FB = In.Load<RWBuffer<float> >(0);
   // expected-error@-1{{Explicit template arguments on intrinsic Load must be a single numeric type}}
   // expected-error@-2{{object 'RWBuffer<float>' is not allowed in builtin template parameters}}
+
   Out[0] = FB[0];
+
+  // Ok:
+  Out[4] = GetValue<float>(4);
+  
+  // expected-note@?{{'Load' declared here}}
+  // expected-error@+1{{calling 'Load' with incomplete return type 'Incomplete'}}
+  Out[8] = In.Load<Incomplete>(8);
+
+  // expected-note@+1 2 {{in instantiation of function template specialization 'GetValue<RWBuffer<float> >' requested here}}
+  RWBuffer<float> FB2 = GetValue<RWBuffer<float> >(16);
 }

From 77b2ff676070aa5d34bcfe2ea0bbd4f435427e0b Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Thu, 5 Jun 2025 13:45:30 -0700
Subject: [PATCH 58/93] NFC: remove dead external validation code paths from
 dxcompiler (#7451)

DXC has now been changed to use the internal validator (loaded by
dxcompiler.dll) by default. This PR removes the ability for dxc.exe to
load dxil.dll in preparation for a series of changes to fix external
validation handling.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 .../dxc/DxilContainer/DxcContainerBuilder.h   |   4 +-
 include/dxc/Support/HLSLOptions.h             |   9 --
 lib/DxilContainer/DxcContainerBuilder.cpp     |  12 +-
 tools/clang/tools/dxcompiler/CMakeLists.txt   |   2 -
 tools/clang/tools/dxcompiler/DXCompiler.cpp   |   8 --
 tools/clang/tools/dxcompiler/dxcapi.cpp       |  12 +-
 tools/clang/tools/dxcompiler/dxcassembler.cpp |   1 -
 tools/clang/tools/dxcompiler/dxclinker.cpp    |   1 -
 .../clang/tools/dxcompiler/dxcompilerobj.cpp  |  20 +--
 tools/clang/tools/dxcompiler/dxcutil.cpp      | 116 +++---------------
 tools/clang/tools/dxcompiler/dxcutil.h        |  17 +--
 tools/clang/tools/dxcompiler/dxillib.cpp      |  73 -----------
 tools/clang/tools/dxcompiler/dxillib.h        |  42 -------
 .../unittests/HLSLTestLib/FileCheckerTest.cpp |  25 ++--
 14 files changed, 39 insertions(+), 303 deletions(-)
 delete mode 100644 tools/clang/tools/dxcompiler/dxillib.cpp
 delete mode 100644 tools/clang/tools/dxcompiler/dxillib.h

diff --git a/include/dxc/DxilContainer/DxcContainerBuilder.h b/include/dxc/DxilContainer/DxcContainerBuilder.h
index 9a3241525c..e79fec18c8 100644
--- a/include/dxc/DxilContainer/DxcContainerBuilder.h
+++ b/include/dxc/DxilContainer/DxcContainerBuilder.h
@@ -45,8 +45,7 @@ class DxcContainerBuilder : public IDxcContainerBuilder {
     return DoBasicQueryInterface<IDxcContainerBuilder>(this, riid, ppvObject);
   }
 
-  void Init(const char *warning = nullptr) {
-    m_warning = warning;
+  void Init() {
     m_RequireValidation = false;
     m_HasPrivateData = false;
     m_HashFunction = nullptr;
@@ -67,7 +66,6 @@ class DxcContainerBuilder : public IDxcContainerBuilder {
 
   PartList m_parts;
   CComPtr<IDxcBlob> m_pContainer;
-  const char *m_warning;
   bool m_RequireValidation;
   bool m_HasPrivateData;
   // Function to compute hash when valid dxil container is built
diff --git a/include/dxc/Support/HLSLOptions.h b/include/dxc/Support/HLSLOptions.h
index bad330747b..31ca3d1c14 100644
--- a/include/dxc/Support/HLSLOptions.h
+++ b/include/dxc/Support/HLSLOptions.h
@@ -114,13 +114,6 @@ struct RewriterOpts {
   bool DeclGlobalCB = false;          // OPT_rw_decl_global_cb
 };
 
-enum class ValidatorSelection : int {
-  Auto,        // Force internal validator (even if DXIL.dll is present)
-  Internal,    // Force internal validator (even if DXIL.dll is present)
-  External,    // Use DXIL.dll, failing compilation if not available
-  Invalid = -1 // Invalid
-};
-
 /// Use this class to capture all options.
 class DxcOpts {
 public:
@@ -225,8 +218,6 @@ class DxcOpts {
   bool ResMayAlias = false;                  // OPT_res_may_alias
   unsigned long ValVerMajor = UINT_MAX,
                 ValVerMinor = UINT_MAX; // OPT_validator_version
-  ValidatorSelection SelectValidator =
-      ValidatorSelection::Auto;         // OPT_select_validator
   unsigned ScanLimit = 0;               // OPT_memdep_block_scan_limit
   bool ForceZeroStoreLifetimes = false; // OPT_force_zero_store_lifetimes
   bool EnableLifetimeMarkers = false;   // OPT_enable_lifetime_markers
diff --git a/lib/DxilContainer/DxcContainerBuilder.cpp b/lib/DxilContainer/DxcContainerBuilder.cpp
index 770aa910a4..be182328dd 100644
--- a/lib/DxilContainer/DxcContainerBuilder.cpp
+++ b/lib/DxilContainer/DxcContainerBuilder.cpp
@@ -146,18 +146,14 @@ DxcContainerBuilder::SerializeContainer(IDxcOperationResult **ppResult) {
     // Combine existing warnings and errors from validation
     CComPtr<IDxcBlobEncoding> pErrorBlob;
     CDxcMallocHeapPtr<char> errorHeap(m_pMalloc);
-    SIZE_T warningLength = m_warning ? strlen(m_warning) : 0;
-    SIZE_T valErrorLength =
+    SIZE_T totalErrorLength =
         pValErrorUtf8 ? pValErrorUtf8->GetStringLength() : 0;
-    SIZE_T totalErrorLength = warningLength + valErrorLength;
     if (totalErrorLength) {
       SIZE_T errorSizeInBytes = totalErrorLength + 1;
       errorHeap.AllocateBytes(errorSizeInBytes);
-      if (warningLength)
-        memcpy(errorHeap.m_pData, m_warning, warningLength);
-      if (valErrorLength)
-        memcpy(errorHeap.m_pData + warningLength,
-               pValErrorUtf8->GetStringPointer(), valErrorLength);
+
+      memcpy(errorHeap.m_pData, pValErrorUtf8->GetStringPointer(),
+             totalErrorLength);
       errorHeap.m_pData[totalErrorLength] = L'\0';
       IFT(hlsl::DxcCreateBlobWithEncodingOnMalloc(errorHeap.m_pData, m_pMalloc,
                                                   errorSizeInBytes, DXC_CP_UTF8,
diff --git a/tools/clang/tools/dxcompiler/CMakeLists.txt b/tools/clang/tools/dxcompiler/CMakeLists.txt
index c69e276194..26bf0e5d98 100644
--- a/tools/clang/tools/dxcompiler/CMakeLists.txt
+++ b/tools/clang/tools/dxcompiler/CMakeLists.txt
@@ -57,7 +57,6 @@ set(SOURCES
   DXCompiler.rc
   DXCompiler.def
   dxcfilesystem.cpp
-  dxillib.cpp
   dxcutil.cpp
   dxcdisassembler.cpp
   dxcpdbutils.cpp
@@ -75,7 +74,6 @@ set(SOURCES
   dxcutil.cpp
   dxcdisassembler.cpp
   dxcpdbutils.cpp
-  dxillib.cpp
   dxcvalidator.cpp
   dxclinker.cpp
   dxcshadersourceinfo.cpp
diff --git a/tools/clang/tools/dxcompiler/DXCompiler.cpp b/tools/clang/tools/dxcompiler/DXCompiler.cpp
index c548441449..c7ffcbffa1 100644
--- a/tools/clang/tools/dxcompiler/DXCompiler.cpp
+++ b/tools/clang/tools/dxcompiler/DXCompiler.cpp
@@ -19,7 +19,6 @@
 #ifdef LLVM_ON_WIN32
 #include "dxcetw.h"
 #endif
-#include "dxillib.h"
 
 namespace hlsl {
 HRESULT SetupRegistryPassForHLSL();
@@ -65,7 +64,6 @@ static HRESULT InitMaybeFail() throw() {
   fsSetup = true;
   IFC(hlsl::SetupRegistryPassForHLSL());
   IFC(hlsl::SetupRegistryPassForPIX());
-  IFC(DxilLibInitialize());
   if (hlsl::options::initHlslOptTable()) {
     hr = E_FAIL;
     goto Cleanup;
@@ -110,12 +108,6 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD Reason, LPVOID reserved) {
     ::hlsl::options::cleanupHlslOptTable();
     ::llvm::sys::fs::CleanupPerThreadFileSystem();
     ::llvm::llvm_shutdown();
-    if (reserved ==
-        NULL) { // FreeLibrary has been called or the DLL load failed
-      DxilLibCleanup(DxilLibCleanUpType::UnloadLibrary);
-    } else { // Process termination. We should not call FreeLibrary()
-      DxilLibCleanup(DxilLibCleanUpType::ProcessTermination);
-    }
     DxcClearThreadMalloc();
     DxcCleanupThreadMalloc();
     DxcEtw_DXCompilerShutdown_Stop(S_OK);
diff --git a/tools/clang/tools/dxcompiler/dxcapi.cpp b/tools/clang/tools/dxcompiler/dxcapi.cpp
index ab2cf1f40e..d4e85bc35c 100644
--- a/tools/clang/tools/dxcompiler/dxcapi.cpp
+++ b/tools/clang/tools/dxcompiler/dxcapi.cpp
@@ -25,7 +25,6 @@
 #include "dxcetw.h"
 #endif
 #include "dxc/DxilContainer/DxcContainerBuilder.h"
-#include "dxillib.h"
 #include <memory>
 
 HRESULT CreateDxcCompiler(REFIID riid, _Out_ LPVOID *ppv);
@@ -59,20 +58,11 @@ HRESULT CreateDxcContainerReflection(REFIID riid, _Out_ LPVOID *ppv) {
 HRESULT CreateDxcContainerBuilder(REFIID riid, _Out_ LPVOID *ppv) {
   // Call dxil.dll's containerbuilder
   *ppv = nullptr;
-  const char *warning;
-  HRESULT hr = DxilLibCreateInstance(CLSID_DxcContainerBuilder,
-                                     (IDxcContainerBuilder **)ppv);
-  if (FAILED(hr)) {
-    warning = "Unable to create container builder from dxil.dll. Resulting "
-              "container will not be signed.\n";
-  } else {
-    return hr;
-  }
 
   CComPtr<DxcContainerBuilder> Result =
       DxcContainerBuilder::Alloc(DxcGetThreadMallocNoRef());
   IFROOM(Result.p);
-  Result->Init(warning);
+  Result->Init();
   return Result->QueryInterface(riid, ppv);
 }
 
diff --git a/tools/clang/tools/dxcompiler/dxcassembler.cpp b/tools/clang/tools/dxcompiler/dxcassembler.cpp
index 0ff2abe26c..6622e93cbc 100644
--- a/tools/clang/tools/dxcompiler/dxcassembler.cpp
+++ b/tools/clang/tools/dxcompiler/dxcassembler.cpp
@@ -19,7 +19,6 @@
 #include "dxc/Support/dxcfilesystem.h"
 #include "dxc/Support/microcom.h"
 #include "dxcutil.h"
-#include "dxillib.h"
 
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/LLVMContext.h"
diff --git a/tools/clang/tools/dxcompiler/dxclinker.cpp b/tools/clang/tools/dxcompiler/dxclinker.cpp
index 82c9b8e96b..f5427ccc08 100644
--- a/tools/clang/tools/dxcompiler/dxclinker.cpp
+++ b/tools/clang/tools/dxcompiler/dxclinker.cpp
@@ -18,7 +18,6 @@
 #include "dxc/Support/dxcapi.impl.h"
 #include "dxc/Support/microcom.h"
 #include "dxc/dxcapi.h"
-#include "dxillib.h"
 
 #include "llvm/ADT/SmallVector.h"
 #include <algorithm>
diff --git a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
index ebeee380ef..84b568df9c 100644
--- a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
+++ b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
@@ -56,7 +56,6 @@
 #include "dxcompileradapter.h"
 #include "dxcshadersourceinfo.h"
 #include "dxcversion.inc"
-#include "dxillib.h"
 #include <algorithm>
 #include <cfloat>
 
@@ -850,11 +849,9 @@ class DxcCompiler : public IDxcCompiler3,
           compiler.getCodeGenOpts().HLSLValidatorMajorVer = opts.ValVerMajor;
           compiler.getCodeGenOpts().HLSLValidatorMinorVer = opts.ValVerMinor;
         } else {
-          // Version from dxil.dll, or internal validator if unavailable
           dxcutil::GetValidatorVersion(
               &compiler.getCodeGenOpts().HLSLValidatorMajorVer,
-              &compiler.getCodeGenOpts().HLSLValidatorMinorVer,
-              opts.SelectValidator);
+              &compiler.getCodeGenOpts().HLSLValidatorMinorVer);
         }
 
         // Root signature-only container validation is only supported on 1.5 and
@@ -934,7 +931,7 @@ class DxcCompiler : public IDxcCompiler3,
             CComPtr<IDxcBlobEncoding> pValErrors;
             // Validation failure communicated through diagnostic error
             dxcutil::ValidateRootSignatureInContainer(
-                pOutputBlob, &compiler.getDiagnostics(), opts.SelectValidator);
+                pOutputBlob, &compiler.getDiagnostics());
           }
         }
       } else if (opts.VerifyDiagnostics) {
@@ -1054,8 +1051,7 @@ class DxcCompiler : public IDxcCompiler3,
               std::move(serializeModule), pOutputBlob, m_pMalloc,
               SerializeFlags, pOutputStream, 0, opts.GetPDBName(),
               &compiler.getDiagnostics(), &ShaderHashContent, pReflectionStream,
-              pRootSigStream, pRootSignatureBlob, pPrivateBlob,
-              opts.SelectValidator);
+              pRootSigStream, pRootSignatureBlob, pPrivateBlob);
 
           inputs.pVersionInfo = static_cast<IDxcVersionInfo *>(this);
 
@@ -1108,8 +1104,7 @@ class DxcCompiler : public IDxcCompiler3,
                 CComPtr<IDxcBlobEncoding> pValErrors;
                 // Validation failure communicated through diagnostic error
                 dxcutil::ValidateRootSignatureInContainer(
-                    pRootSignature, &compiler.getDiagnostics(),
-                    opts.SelectValidator);
+                    pRootSignature, &compiler.getDiagnostics());
               }
               IFT(pResult->SetOutputObject(DXC_OUT_ROOT_SIGNATURE,
                                            pRootSignature));
@@ -1324,13 +1319,6 @@ class DxcCompiler : public IDxcCompiler3,
       CComPtr<IDxcResult> pResult;
       hr = e.hr;
       std::string msg("Internal Compiler error: ");
-      switch (hr) {
-      case DXC_E_VALIDATOR_MISSING:
-        msg = "Error: external validator selected, but DXIL.dll not found.";
-        break;
-      default:
-        break;
-      }
       msg += e.msg;
       if (SUCCEEDED(DxcResult::Create(
               e.hr, DXC_OUT_NONE,
diff --git a/tools/clang/tools/dxcompiler/dxcutil.cpp b/tools/clang/tools/dxcompiler/dxcutil.cpp
index ea3f72dcb4..4e5c5c95e8 100644
--- a/tools/clang/tools/dxcompiler/dxcutil.cpp
+++ b/tools/clang/tools/dxcompiler/dxcutil.cpp
@@ -19,7 +19,6 @@
 #include "dxc/Support/WinIncludes.h"
 #include "dxc/Support/dxcapi.impl.h"
 #include "dxc/dxcapi.h"
-#include "dxillib.h"
 #include "clang/Basic/Diagnostic.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/DebugInfo.h"
@@ -50,32 +49,8 @@ namespace {
 // AssembleToContainer helper functions.
 
 // return true if the internal validator was used, false otherwise
-bool CreateValidator(CComPtr<IDxcValidator> &pValidator,
-                     hlsl::options::ValidatorSelection SelectValidator =
-                         hlsl::options::ValidatorSelection::Auto) {
-  bool bInternal =
-      SelectValidator == hlsl::options::ValidatorSelection::Internal;
-  bool bExternal =
-      SelectValidator == hlsl::options::ValidatorSelection::External;
-  bool bAuto = SelectValidator == hlsl::options::ValidatorSelection::Auto;
-
-  // default behavior uses internal validator, as well as
-  // explicitly specifying internal
-  if (bInternal || bAuto) {
-    IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator)));
-    return true;
-  }
-
-  if (bExternal) {
-    // if external was explicitly specified, but no
-    // external validator could be found (no DXIL.dll), then error
-    IFTBOOL(DxilLibIsEnabled(), DXC_E_VALIDATOR_MISSING);
-    IFT(DxilLibCreateInstance(CLSID_DxcValidator, &pValidator));
-
-    return false;
-  }
-
-  return false;
+void CreateValidator(CComPtr<IDxcValidator> &pValidator) {
+  IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator)));
 }
 
 } // namespace
@@ -89,23 +64,20 @@ AssembleInputs::AssembleInputs(
     uint32_t ValidationFlags, llvm::StringRef DebugName,
     clang::DiagnosticsEngine *pDiag, hlsl::DxilShaderHash *pShaderHashOut,
     AbstractMemoryStream *pReflectionOut, AbstractMemoryStream *pRootSigOut,
-    CComPtr<IDxcBlob> pRootSigBlob, CComPtr<IDxcBlob> pPrivateBlob,
-    hlsl::options::ValidatorSelection SelectValidator)
+    CComPtr<IDxcBlob> pRootSigBlob, CComPtr<IDxcBlob> pPrivateBlob)
     : pM(std::move(pM)), pOutputContainerBlob(pOutputContainerBlob),
       pMalloc(pMalloc), SerializeFlags(SerializeFlags),
       ValidationFlags(ValidationFlags), pModuleBitcode(pModuleBitcode),
       DebugName(DebugName), pDiag(pDiag), pShaderHashOut(pShaderHashOut),
       pReflectionOut(pReflectionOut), pRootSigOut(pRootSigOut),
-      pRootSigBlob(pRootSigBlob), pPrivateBlob(pPrivateBlob),
-      SelectValidator(SelectValidator) {}
+      pRootSigBlob(pRootSigBlob), pPrivateBlob(pPrivateBlob) {}
 
-void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor,
-                         hlsl::options::ValidatorSelection SelectValidator) {
+void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor) {
   if (pMajor == nullptr || pMinor == nullptr)
     return;
 
   CComPtr<IDxcValidator> pValidator;
-  CreateValidator(pValidator, SelectValidator);
+  CreateValidator(pValidator);
 
   CComPtr<IDxcVersionInfo> pVersionInfo;
   if (SUCCEEDED(pValidator.QueryInterface(&pVersionInfo))) {
@@ -177,76 +149,19 @@ HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs) {
   std::unique_ptr<llvm::Module> llvmModuleWithDebugInfo;
 
   CComPtr<IDxcValidator> pValidator;
-  bool bInternalValidator = CreateValidator(pValidator, inputs.SelectValidator);
-  // Warning on internal Validator
-
-  CComPtr<IDxcValidator2> pValidator2;
-  if (!bInternalValidator) {
-    pValidator.QueryInterface(&pValidator2);
-  }
-
-  if (bInternalValidator || pValidator2) {
-    // If using the internal validator or external validator supports
-    // IDxcValidator2, we'll use the modules directly. In this case, we'll want
-    // to make a clone to avoid SerializeDxilContainerForModule stripping all
-    // the debug info. The debug info will be stripped from the orginal module,
-    // but preserved in the cloned module.
-    if (llvm::getDebugMetadataVersionFromModule(*inputs.pM) != 0) {
-      llvmModuleWithDebugInfo.reset(llvm::CloneModule(inputs.pM.get()));
-    }
-  }
+  CreateValidator(pValidator);
 
-  // Verify validator version can validate this module
-  CComPtr<IDxcVersionInfo> pValidatorVersion;
-  IFT(pValidator->QueryInterface(&pValidatorVersion));
-  UINT32 ValMajor, ValMinor;
-  IFT(pValidatorVersion->GetVersion(&ValMajor, &ValMinor));
-  DxilModule &DM = inputs.pM.get()->GetOrCreateDxilModule();
-  unsigned ReqValMajor, ReqValMinor;
-  DM.GetValidatorVersion(ReqValMajor, ReqValMinor);
-  if (DXIL::CompareVersions(ValMajor, ValMinor, ReqValMajor, ReqValMinor) < 0) {
-    // Module is expecting to be validated by a newer validator.
-    if (inputs.pDiag) {
-      unsigned diagID = inputs.pDiag->getCustomDiagID(
-          clang::DiagnosticsEngine::Level::Error,
-          "The module cannot be validated by the version of the validator "
-          "currently attached.");
-      inputs.pDiag->Report(diagID);
-    }
-    return E_FAIL;
-  }
+  if (llvm::getDebugMetadataVersionFromModule(*inputs.pM) != 0)
+    llvmModuleWithDebugInfo.reset(llvm::CloneModule(inputs.pM.get()));
 
   AssembleToContainer(inputs);
 
   CComPtr<IDxcOperationResult> pValResult;
-  // Important: in-place edit is required so the blob is reused and thus
-  // dxil.dll can be released.
+  // In-place edit to avoid an extra copy
   inputs.ValidationFlags |= DxcValidatorFlags_InPlaceEdit;
-  if (bInternalValidator) {
-    IFT(RunInternalValidator(pValidator, llvmModuleWithDebugInfo.get(),
-                             inputs.pOutputContainerBlob,
-                             inputs.ValidationFlags, &pValResult));
-  } else {
-    if (pValidator2 && llvmModuleWithDebugInfo) {
-      // If metadata was stripped, re-serialize the input module.
-      CComPtr<AbstractMemoryStream> pDebugModuleStream;
-      IFT(CreateMemoryStream(DxcGetThreadMallocNoRef(), &pDebugModuleStream));
-      raw_stream_ostream outStream(pDebugModuleStream.p);
-      WriteBitcodeToFile(llvmModuleWithDebugInfo.get(), outStream, true);
-      outStream.flush();
-
-      DxcBuffer debugModule = {};
-      debugModule.Ptr = pDebugModuleStream->GetPtr();
-      debugModule.Size = pDebugModuleStream->GetPtrSize();
-
-      IFT(pValidator2->ValidateWithDebug(inputs.pOutputContainerBlob,
-                                         inputs.ValidationFlags, &debugModule,
-                                         &pValResult));
-    } else {
-      IFT(pValidator->Validate(inputs.pOutputContainerBlob,
-                               inputs.ValidationFlags, &pValResult));
-    }
-  }
+  IFT(RunInternalValidator(pValidator, llvmModuleWithDebugInfo.get(),
+                           inputs.pOutputContainerBlob, inputs.ValidationFlags,
+                           &pValResult));
   IFT(pValResult->GetStatus(&valHR));
   if (inputs.pDiag) {
     if (FAILED(valHR)) {
@@ -271,9 +186,8 @@ HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs) {
   return valHR;
 }
 
-HRESULT ValidateRootSignatureInContainer(
-    IDxcBlob *pRootSigContainer, clang::DiagnosticsEngine *pDiag,
-    hlsl::options::ValidatorSelection SelectValidator) {
+HRESULT ValidateRootSignatureInContainer(IDxcBlob *pRootSigContainer,
+                                         clang::DiagnosticsEngine *pDiag) {
   HRESULT valHR = S_OK;
   CComPtr<IDxcValidator> pValidator;
   CComPtr<IDxcOperationResult> pValResult;
diff --git a/tools/clang/tools/dxcompiler/dxcutil.h b/tools/clang/tools/dxcompiler/dxcutil.h
index 45b3d4dc1a..8612353561 100644
--- a/tools/clang/tools/dxcompiler/dxcutil.h
+++ b/tools/clang/tools/dxcompiler/dxcutil.h
@@ -54,9 +54,7 @@ struct AssembleInputs {
                  hlsl::AbstractMemoryStream *pReflectionOut = nullptr,
                  hlsl::AbstractMemoryStream *pRootSigOut = nullptr,
                  CComPtr<IDxcBlob> pRootSigBlob = nullptr,
-                 CComPtr<IDxcBlob> pPrivateBlob = nullptr,
-                 hlsl::options::ValidatorSelection SelectValidator =
-                     hlsl::options::ValidatorSelection::Auto);
+                 CComPtr<IDxcBlob> pPrivateBlob = nullptr);
   std::unique_ptr<llvm::Module> pM;
   CComPtr<IDxcBlob> &pOutputContainerBlob;
   IDxcVersionInfo *pVersionInfo = nullptr;
@@ -71,18 +69,13 @@ struct AssembleInputs {
   hlsl::AbstractMemoryStream *pRootSigOut = nullptr;
   CComPtr<IDxcBlob> pRootSigBlob = nullptr;
   CComPtr<IDxcBlob> pPrivateBlob = nullptr;
-  hlsl::options::ValidatorSelection SelectValidator =
-      hlsl::options::ValidatorSelection::Auto;
 };
 HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs);
-HRESULT ValidateRootSignatureInContainer(
-    IDxcBlob *pRootSigContainer, clang::DiagnosticsEngine *pDiag = nullptr,
-    hlsl::options::ValidatorSelection SelectValidator =
-        hlsl::options::ValidatorSelection::Auto);
+HRESULT
+ValidateRootSignatureInContainer(IDxcBlob *pRootSigContainer,
+                                 clang::DiagnosticsEngine *pDiag = nullptr);
 HRESULT SetRootSignature(hlsl::DxilModule *pModule, CComPtr<IDxcBlob> pSource);
-void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor,
-                         hlsl::options::ValidatorSelection SelectValidator =
-                             hlsl::options::ValidatorSelection::Auto);
+void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor);
 void AssembleToContainer(AssembleInputs &inputs);
 HRESULT Disassemble(IDxcBlob *pProgram, llvm::raw_string_ostream &Stream);
 void ReadOptsAndValidate(hlsl::options::MainArgs &mainArgs,
diff --git a/tools/clang/tools/dxcompiler/dxillib.cpp b/tools/clang/tools/dxcompiler/dxillib.cpp
deleted file mode 100644
index 72abc869da..0000000000
--- a/tools/clang/tools/dxcompiler/dxillib.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// dxillib.cpp                                                               //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// Provides access to dxil.dll                                               //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#include "dxillib.h"
-#include "dxc/Support/Global.h" // For DXASSERT
-#include "dxc/Support/dxcapi.use.h"
-#include "llvm/Support/Mutex.h"
-
-using namespace dxc;
-
-static DxcDllSupport g_DllSupport;
-static HRESULT g_DllLibResult = S_OK;
-
-static llvm::sys::Mutex *cs = nullptr;
-
-// Check if we can successfully get IDxcValidator from dxil.dll
-// This function is to prevent multiple attempts to load dxil.dll
-HRESULT DxilLibInitialize() {
-  cs = new llvm::sys::Mutex;
-  cs->lock();
-  g_DllLibResult = g_DllSupport.InitializeForDll(kDxilLib, "DxcCreateInstance");
-  cs->unlock();
-  return S_OK;
-}
-
-HRESULT DxilLibCleanup(DxilLibCleanUpType type) {
-  HRESULT hr = S_OK;
-  if (type == DxilLibCleanUpType::ProcessTermination) {
-    g_DllSupport.Detach();
-  } else if (type == DxilLibCleanUpType::UnloadLibrary) {
-    g_DllSupport.Cleanup();
-  } else {
-    hr = E_INVALIDARG;
-  }
-  delete cs;
-  cs = nullptr;
-  return hr;
-}
-
-// g_DllLibResult is S_OK by default, check again to see if dxil.dll is loaded
-// If we fail to load dxil.dll, set g_DllLibResult to E_FAIL so that we don't
-// have multiple attempts to load dxil.dll
-bool DxilLibIsEnabled() {
-  cs->lock();
-  if (SUCCEEDED(g_DllLibResult)) {
-    if (!g_DllSupport.IsEnabled()) {
-      g_DllLibResult =
-          g_DllSupport.InitializeForDll(kDxilLib, "DxcCreateInstance");
-    }
-  }
-  cs->unlock();
-  return SUCCEEDED(g_DllLibResult);
-}
-
-HRESULT DxilLibCreateInstance(REFCLSID rclsid, REFIID riid,
-                              IUnknown **ppInterface) {
-  DXASSERT_NOMSG(ppInterface != nullptr);
-  HRESULT hr = E_FAIL;
-  if (DxilLibIsEnabled()) {
-    cs->lock();
-    hr = g_DllSupport.CreateInstance(rclsid, riid, ppInterface);
-    cs->unlock();
-  }
-  return hr;
-}
diff --git a/tools/clang/tools/dxcompiler/dxillib.h b/tools/clang/tools/dxcompiler/dxillib.h
deleted file mode 100644
index 879d023459..0000000000
--- a/tools/clang/tools/dxcompiler/dxillib.h
+++ /dev/null
@@ -1,42 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// dxillib.h                                                                 //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// Provides wrappers to handle calls to dxil.dll                             //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#pragma once
-#ifndef __DXC_DXILLIB__
-#define __DXC_DXILLIB__
-
-#include "dxc/Support/WinIncludes.h"
-#include "dxc/WinAdapter.h"
-
-// Initialize Dxil library.
-HRESULT DxilLibInitialize();
-
-// When dxcompiler is detached from process,
-// we should not call FreeLibrary on process termination.
-// So the caller has to specify if cleaning is from FreeLibrary or process
-// termination
-enum class DxilLibCleanUpType { UnloadLibrary, ProcessTermination };
-
-HRESULT DxilLibCleanup(DxilLibCleanUpType type);
-
-// Check if can access dxil.dll
-bool DxilLibIsEnabled();
-
-HRESULT DxilLibCreateInstance(REFCLSID rclsid, REFIID riid,
-                              IUnknown **ppInterface);
-
-template <class TInterface>
-HRESULT DxilLibCreateInstance(REFCLSID rclsid, TInterface **ppInterface) {
-  return DxilLibCreateInstance(rclsid, __uuidof(TInterface),
-                               (IUnknown **)ppInterface);
-}
-
-#endif // __DXC_DXILLIB__
diff --git a/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp b/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp
index 2c75d45e5e..2d9ee7315d 100644
--- a/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp
+++ b/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp
@@ -519,28 +519,21 @@ FileRunCommandPart::RunDxc(dxc::DxcDllSupport &DllSupport,
       // Convert stage to minimum dxil/validator version:
       RequiredDxilMajor = std::max(RequiredDxilMajor, (unsigned)6) - 5;
 
-      bool bInternalValidator =
-          opts.SelectValidator == hlsl::options::ValidatorSelection::Internal;
       bool bValVerExplicit = opts.ValVerMajor != UINT_MAX;
 
-      // Normally we must check the validator version as well, but there are
-      // two scenarios where the validator version doesn't need to be checked
-      // against the version based on the shader model:
-      // 1. The test selects internal validator.
-      // 2. The test explicitly requests a specific validator version.
-      FileRunCommandResult result =
-          CheckDxilVer(DllSupport, RequiredDxilMajor, RequiredDxilMinor,
-                       !(bInternalValidator || bValVerExplicit));
+      // If validator version set explicitly, skip validator version check when
+      // checking required version for shader model.
+      FileRunCommandResult result = CheckDxilVer(
+          DllSupport, RequiredDxilMajor, RequiredDxilMinor, !bValVerExplicit);
       if (result.AbortPipeline)
         return result;
 
       // Additionally, if the test explicitly requests a specific non-zero
-      // validator version, and doesn't select internal validator or disable
-      // validation, we must check that the validator version is at least as
-      // high as the requested version.
-      // When ValVerMajor is 0, validation cannot be run against the module.
-      if (bValVerExplicit && opts.ValVerMajor != 0 &&
-          !(bInternalValidator || opts.DisableValidation))
+      // validator version, and doesn't disable validation, we must check
+      // that the validator version is at least as high as the requested
+      // version. When ValVerMajor is 0, validation cannot be run against
+      // the module.
+      if (bValVerExplicit && opts.ValVerMajor != 0 && !opts.DisableValidation)
         result = CheckDxilVer(DllSupport, opts.ValVerMajor, opts.ValVerMinor);
       if (result.AbortPipeline)
         return result;

From 50f53c6c200fd6b53f65268912e6f9e444ce9242 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr@microsoft.com>
Date: Thu, 5 Jun 2025 16:52:05 -0700
Subject: [PATCH 59/93] NFC: Update indentation in latest-release.json for
 clang-format (#7515)

This avoids other unrelated changes which didn't intend to change this
file from having to update the file just to make clang-format happy.
---
 utils/version/latest-release.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/utils/version/latest-release.json b/utils/version/latest-release.json
index 146acf0708..40d50a28ba 100644
--- a/utils/version/latest-release.json
+++ b/utils/version/latest-release.json
@@ -1,8 +1,8 @@
 {
-    "version": {
-        "major": "1",
-        "minor": "8",
-        "rev": "2505"
-    },
-    "sha": "0fd79eba6bb23f50ec21a7a7daeee3614bebe12b"
+  "version": {
+    "major": "1",
+    "minor": "8",
+    "rev": "2505"
+  },
+  "sha": "0fd79eba6bb23f50ec21a7a7daeee3614bebe12b"
 }

From 2084643e7ae44f23c80eb74d0cd549a8179c7443 Mon Sep 17 00:00:00 2001
From: Dan Brown <61992655+danbrown-amd@users.noreply.github.com>
Date: Fri, 6 Jun 2025 13:04:30 -0600
Subject: [PATCH 60/93] [spirv] Corrects output node index parameter. (#7517)

The node index parameter of `OpAllocateNodePayloadsAMDX` was being set
to the value of the NodeId index argument (which is captured in the
`PayloadNodeBaseIndexAMDX` decoration). Instead, it should be set to the
node's index in the node array, if any, or zero for a single node.
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp          | 12 ++----------
 tools/clang/test/CodeGenSPIRV/node.renamed.hlsl |  7 ++++++-
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index f3d10537e1..36c9b33ad5 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -11256,17 +11256,9 @@ SpirvInstruction *SpirvEmitter::processIntrinsicGetNodeOutputRecords(
 
   const auto *declRefExpr = dyn_cast<DeclRefExpr>(baseExpr->IgnoreImpCasts());
   const auto *paramDecl = dyn_cast<ParmVarDecl>(declRefExpr->getDecl());
-  const auto *nodeID = paramDecl->getAttr<HLSLNodeIdAttr>();
-  StringRef nodeName = paramDecl->getName();
-  unsigned nodeIndex = 0;
-  if (nodeID) {
-    nodeName = nodeID->getName();
-    nodeIndex = nodeID->getArrayIndex();
-  }
-
   if (!shaderIndex) {
-    shaderIndex = spvBuilder.getConstantInt(astContext.UnsignedIntTy,
-                                            llvm::APInt(32, nodeIndex));
+    shaderIndex =
+        spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
   }
 
   LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
diff --git a/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
index 953288929d..265fd6c17f 100644
--- a/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl
@@ -12,7 +12,8 @@ struct RECORD {
 [NodeIsProgramEntry]
 void node017_renamed_node([NodeID("output_node_name", 2)] NodeOutput<RECORD> r)
 {
-  r.GetThreadNodeOutputRecords(1);
+  ThreadNodeOutputRecords<RECORD> records = r.GetThreadNodeOutputRecords(1);
+  records.OutputComplete();
 }
 
 // CHECK: OpEntryPoint GLCompute %{{[^ ]*}} "node017_renamed_node"
@@ -20,4 +21,8 @@ void node017_renamed_node([NodeID("output_node_name", 2)] NodeOutput<RECORD> r)
 // CHECK-DAG: OpDecorateId [[TYPE]] PayloadNodeBaseIndexAMDX [[U2:%[0-9A-Za-z_]*]]
 // CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0
 // CHECK-DAG: [[STR]] = OpConstantStringAMDX "output_node_name"
+// CHECK-DAG: [[U0:%[_0-9A-Za-z]*]] = OpConstant [[UINT]] 0
+// CHECK-DAG: [[U1:%[_0-9A-Za-z]*]] = OpConstant [[UINT]] 1
 // CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2
+// CHECK-DAG: [[U4:%[_0-9A-Za-z]*]] = OpConstant [[UINT]] 4
+// CHECK: OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]]

From 9b5f5c9fb239ae83a7d318b0b45632f343866583 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 12 Jun 2025 09:14:55 -0400
Subject: [PATCH 61/93] [SPIRV] Use OpCopyLogical to reconstruct values (#7530)

When DXC needs to change the layout of a value, it currently has to
extract each individual scalar, and then reconstruct using the type with
the different layout.

If you have a large array or struct with many members, this generates a
lot of extra code.

Starting in SPIR-V 1.4, the OpCopyLogical instruction is available to do
the reconstruction.

This should help generate less code, which will lead to improved compile
time and maybe smaller binary sizes.

Fixes #7493
---
 .../clang/include/clang/SPIRV/AstTypeProbe.h  |  4 ++
 tools/clang/lib/SPIRV/AstTypeProbe.cpp        | 21 ++++++
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 39 +++++++++++
 tools/clang/lib/SPIRV/SpirvEmitter.h          |  2 +
 .../clang/test/CodeGenSPIRV/logical_copy.hlsl | 67 +++++++++++++++++++
 5 files changed, 133 insertions(+)
 create mode 100644 tools/clang/test/CodeGenSPIRV/logical_copy.hlsl

diff --git a/tools/clang/include/clang/SPIRV/AstTypeProbe.h b/tools/clang/include/clang/SPIRV/AstTypeProbe.h
index 6302d43a88..9abea972c6 100644
--- a/tools/clang/include/clang/SPIRV/AstTypeProbe.h
+++ b/tools/clang/include/clang/SPIRV/AstTypeProbe.h
@@ -337,6 +337,10 @@ bool isOrContainsNonFpColMajorMatrix(const ASTContext &,
                                      const SpirvCodeGenOptions &, QualType type,
                                      const Decl *decl);
 
+/// \brief Returns true if the type is a boolean type or an aggregate type that
+/// contains a boolean type.
+bool isOrContainsBoolType(QualType type);
+
 /// \brief Returns true if the given type is `vk::ext_result_id<T>`.
 bool isExtResultIdType(QualType type);
 
diff --git a/tools/clang/lib/SPIRV/AstTypeProbe.cpp b/tools/clang/lib/SPIRV/AstTypeProbe.cpp
index 31a9bd8f7d..b6ca1f60ae 100644
--- a/tools/clang/lib/SPIRV/AstTypeProbe.cpp
+++ b/tools/clang/lib/SPIRV/AstTypeProbe.cpp
@@ -1353,6 +1353,27 @@ bool isOrContainsNonFpColMajorMatrix(const ASTContext &astContext,
   return false;
 }
 
+bool isOrContainsBoolType(QualType type) {
+  if (isBoolOrVecMatOfBoolType(type)) {
+    return true;
+  }
+
+  if (const auto *arrayType = type->getAsArrayTypeUnsafe()) {
+    return isOrContainsBoolType(arrayType->getElementType());
+  }
+
+  if (const auto *recordType = type->getAs<RecordType>()) {
+    for (auto field : recordType->getDecl()->fields()) {
+      if (isOrContainsBoolType(field->getType())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  return false;
+}
+
 bool isTypeInVkNamespace(const RecordType *type) {
   if (const auto *nameSpaceDecl =
           dyn_cast<NamespaceDecl>(type->getDecl()->getDeclContext())) {
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 36c9b33ad5..cc7016b594 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -7108,6 +7108,38 @@ void SpirvEmitter::storeValue(SpirvInstruction *lhsPtr,
   }
 }
 
+bool SpirvEmitter::canUseOpCopyLogical(QualType type) const {
+  if (featureManager.getSpirvVersion(featureManager.getTargetEnv()) <
+      VersionTuple(1, 4)) {
+    return false;
+  }
+
+  if (!type->isArrayType() && !type->isRecordType()) {
+    return false;
+  }
+
+  if (const auto *recordType = type->getAs<RecordType>()) {
+    if (isTypeInVkNamespace(recordType) &&
+        (recordType->getDecl()->getName().equals("BufferPointer") ||
+         recordType->getDecl()->getName().equals("SpirvType") ||
+         recordType->getDecl()->getName().equals("SpirvOpaqueType"))) {
+      // vk::BufferPointer<T> lowers to a pointer type. No need to reconstruct
+      // the value. The vk::Spirv*Type should be treated as an opaque type. All
+      // we can do is leave it the same.
+      return false;
+    }
+  }
+
+  if (hlsl::IsHLSLVecMatType(type) || hlsl::IsHLSLResourceType(type)) {
+    return false;
+  }
+
+  // If the type contains a bool it is possible that one type represents it with
+  // a bool and the other with an int. If that happens, OpCopyLogical is not
+  // valid.
+  return !isOrContainsBoolType(type);
+}
+
 SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
                                                  const QualType valType,
                                                  SpirvLayoutRule dstLR,
@@ -7171,6 +7203,13 @@ SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
     return result;
   };
 
+  if (canUseOpCopyLogical(valType)) {
+    SpirvInstruction *copy = spvBuilder.createUnaryOp(
+        spv::Op::OpCopyLogical, valType, srcVal, srcVal->getSourceLocation());
+    copy->setLayoutRule(dstLR);
+    return copy;
+  }
+
   // Constant arrays
   if (const auto *arrayType = astContext.getAsConstantArrayType(valType)) {
     const auto elemType = arrayType->getElementType();
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 954b2c5dd3..14401c6418 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -228,6 +228,8 @@ class SpirvEmitter : public ASTConsumer {
                   QualType lhsValType, SourceLocation loc,
                   SourceRange range = {});
 
+  bool canUseOpCopyLogical(QualType type) const;
+
   /// Decomposes and reconstructs the given srcVal of the given valType to meet
   /// the requirements of the dstLR layout rule.
   SpirvInstruction *reconstructValue(SpirvInstruction *srcVal, QualType valType,
diff --git a/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl b/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl
new file mode 100644
index 0000000000..eb4a803548
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl
@@ -0,0 +1,67 @@
+// RUN: %dxc %s -fcgl -spirv -T ps_6_8 -fspv-target-env=vulkan1.1spirv1.4 | FileCheck %s
+
+
+
+struct WithBool {
+  bool b;
+};
+
+struct StructWithBool {
+  WithBool wb;
+};
+
+struct StructWithoutBool {
+  int a;
+};
+
+struct OuterStruct {
+  StructWithBool a[2];
+  WithBool b;
+  StructWithoutBool c;
+  StructWithoutBool d[2];
+} S;
+
+
+// CHECK: %GetStruct = OpFunction %OuterStruct_0 None %34
+// CHECK: %bb_entry_0 = OpLabel
+// CHECK: [[ld:%[0-9]+]] = OpLoad %OuterStruct %39
+
+// The array `a` must be split up because it contains a bool that needs a
+// conversion from int to bool.
+// CHECK: [[arr_with_bool:%[0-9]+]] = OpCompositeExtract %_arr_StructWithBool_uint_2 [[ld]] 0
+// CHECK: [[struct_with_bool:%[0-9]+]] = OpCompositeExtract %StructWithBool [[arr_with_bool]] 0
+// CHECK: [[with_bool:%[0-9]+]] = OpCompositeExtract %WithBool [[struct_with_bool]] 0
+// CHECK: [[int:%[0-9]+]] = OpCompositeExtract %uint [[with_bool]] 0
+// CHECK: [[bool:%[0-9]+]] = OpINotEqual %bool [[int]] %uint_0
+// CHECK: [[with_bool:%[0-9]+]] = OpCompositeConstruct %WithBool_0 [[bool]]
+// CHECK: [[struct_with_bool:%[0-9]+]] = OpCompositeConstruct %StructWithBool_0 [[with_bool]]
+
+// Skip second element of the array. It is more of the same.
+// CHECK: [[a:%[0-9]+]] = OpCompositeConstruct %_arr_StructWithBool_0_uint_2 [[struct_with_bool]] {{%.*}}
+
+// The struct `b` must be split up for the same reason.
+// CHECK: [[with_bool:%[0-9]+]] = OpCompositeExtract %WithBool [[ld]] 1
+// CHECK: [[int:%[0-9]+]] = OpCompositeExtract %uint [[with_bool]] 0
+// CHECK: [[bool:%[0-9]+]] = OpINotEqual %bool [[int]] %uint_0
+// CHECK: [[b:%[0-9]+]] = OpCompositeConstruct %WithBool_0 [[bool]]
+
+// The struct `c` can use OpCopyLogical.
+// CHECK: %59 = OpCompositeExtract %StructWithoutBool [[ld]] 2
+// CHECK: [[c:%[0-9]+]] = OpCopyLogical %StructWithoutBool_0 %59
+
+// The array `d` can use OpCopyLogical.
+// CHECK: %61 = OpCompositeExtract %_arr_StructWithoutBool_uint_2 [[ld]] 3
+// CHECK: [[d:%[0-9]+]] = OpCopyLogical %_arr_StructWithoutBool_0_uint_2 %61
+
+// CHECK: [[r:%[0-9]+]] = OpCompositeConstruct %OuterStruct_0 [[a]] [[b]] [[c]] [[d]]
+// CHECK: OpStore {{%.*}} [[r]]
+// CHECK: OpFunctionEnd
+
+OuterStruct GetStruct() { return S; }
+
+uint main() : SV_TARGET
+{
+  GetStruct();
+  return 0;
+}
+

From 57177f77a4dc6996400ac97a0d618799c82374e8 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 13 Jun 2025 07:05:13 -0400
Subject: [PATCH 62/93] [SPIRV] Use unknown image format in vk1.3 and later
 (#7528)

We have had many requests to use the `unknown` image format for storage
images (OpTypeImage with sampled=2). We did not want to do that when
targeting earlier versions of Vulkan because it could break existing
code. The capability StorageImageWriteWithoutFormat is not guaranteed to
be available on Vulkan 1.1 devices. This means the application will stop
working.

However, Vulkan 1.3 guarantees that StorageImageWriteWithoutFormat and
StorageImageReadWithoutFormat are available. We can make this change for
VK1.3 and later without breaking existing code.

Fixes #7484
---
 .../clang/include/clang/SPIRV/SpirvBuilder.h  |   2 +
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp    |   7 +
 .../CodeGenSPIRV/node.empty-node-input.hlsl   |   2 +-
 .../clang/test/CodeGenSPIRV/type.buffer.hlsl  | 176 +++++++++++-------
 .../type.rasterizer-ordered-buffer.hlsl       |  92 +++++----
 .../type.rasterizer-ordered-texture.hlsl      |  36 ++--
 .../test/CodeGenSPIRV/type.rwtexture.hlsl     |  56 ++++--
 7 files changed, 234 insertions(+), 137 deletions(-)

diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
index 465f7313f1..4fe31c6d62 100644
--- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h
+++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
@@ -812,6 +812,8 @@ class SpirvBuilder {
   /// the given target at the given source location.
   inline void requireExtension(llvm::StringRef extension, SourceLocation);
 
+  FeatureManager &getFeatureManager() { return featureManager; }
+
 private:
   /// \brief If not added already, adds an OpExtInstImport (import of extended
   /// instruction set) for the given instruction set. Returns the imported
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 1869983ae3..0309d56840 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -1156,6 +1156,13 @@ LowerTypeVisitor::lowerStructFields(const RecordDecl *decl,
 spv::ImageFormat
 LowerTypeVisitor::translateSampledTypeToImageFormat(QualType sampledType,
                                                     SourceLocation srcLoc) {
+
+  // In Vulkan 1.3, all image types can be Unknown.
+  FeatureManager &featureManager = spvBuilder.getFeatureManager();
+  if (!featureManager.isTargetEnvVulkan() ||
+      featureManager.isTargetEnvVulkan1p3OrAbove())
+    return spv::ImageFormat::Unknown;
+
   uint32_t elemCount = 1;
   QualType ty = {};
   if (!isScalarType(sampledType, &ty) &&
diff --git a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
index fa16429a1b..da6a1d32df 100644
--- a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
@@ -19,7 +19,7 @@ void emptynodeinput(EmptyNodeInput input)
 
 // CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
 // CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
-// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 R32ui
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 Unknown
 // CHECK-DAG: [[IMGPTR:%[^ ]*]] = OpTypePointer UniformConstant [[IMG]]
 // CHECK-DAG: [[BUF:%[^ ]*]] = OpVariable [[IMGPTR]] UniformConstant
 
diff --git a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
index 35d1b868a8..3e7bb73bcb 100644
--- a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
@@ -1,109 +1,149 @@
-// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+
+// Before vulkan1.3, we should be trying to infer the image type because
+// we cannot necessarily use Unknown. However in VK1.3 and later, we can use
+// Unknown.
 
 // CHECK: OpCapability SampledBuffer
-// CHECK: OpCapability StorageImageExtendedFormats
+// INFER: OpCapability StorageImageExtendedFormats
 
-// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
+// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
+// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 Buffer<int> intbuf;
-// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
+// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
+// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 Buffer<uint> uintbuf;
-// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
+// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
+// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 Buffer<float> floatbuf;
 
-// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
+// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
+// UNKNOWN: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RWBuffer<int> intrwbuf;
-// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// UNKNOWN: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RWBuffer<uint> uintrwbuf;
-// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// UNKNOWN: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RWBuffer<float> floatrwbuf;
 
-// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// If the `Unknown` image format is used, then the images below will reuse the types above.
+// UNKNOWN-NOT: OpTypeImage
+
+// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
 Buffer<int2> int2buf;
-// CHECK: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// INFER: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 Buffer<uint2> uint2buf;
-// CHECK: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// INFER: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
 Buffer<float2> float2buf;
 
-// CHECK: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// INFER: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RWBuffer<int2> int2rwbuf;
-// CHECK: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// INFER: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
 RWBuffer<uint2> uint2rwbuf;
-// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RWBuffer<float2> float2rwbuf;
 
-// CHECK: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
-// CHECK: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
+// INFER: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
+// INFER: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
 Buffer<int3> int3buf;
 Buffer<int4> int4buf;
-// CHECK: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
-// CHECK: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
+// INFER: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
+// INFER: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
 Buffer<uint3> uint3buf;
 Buffer<uint4> uint4buf;
-// CHECK: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
-// CHECK: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
+// INFER: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
+// INFER: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
 Buffer<float3> float3buf;
 Buffer<float4> float4buf;
 
-// CHECK: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
-// CHECK: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
+// INFER: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
+// INFER: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
 RWBuffer<int3> int3rwbuf;
 RWBuffer<int4> int4rwbuf;
-// CHECK: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
-// CHECK: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
+// INFER: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
+// INFER: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
 RWBuffer<uint3> uint3rwbuf;
 RWBuffer<uint4> uint4rwbuf;
-// CHECK: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
-// CHECK: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
+// INFER: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
+// INFER: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
 RWBuffer<float3> float3rwbuf;
 RWBuffer<float4> float4rwbuf;
 
-// CHECK: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// CHECK: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// CHECK: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// CHECK: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// CHECK: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// CHECK: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// CHECK: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// CHECK: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// CHECK: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// CHECK: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// CHECK: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// CHECK: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
-// CHECK: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
-// CHECK: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
-// CHECK: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
-// CHECK: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
-// CHECK: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
-// CHECK: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
-// CHECK: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
-// CHECK: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
-// CHECK: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
-// CHECK: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
-// CHECK: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
-// CHECK: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
+// INFER: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// INFER: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// INFER: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// INFER: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// INFER: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// INFER: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// INFER: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// INFER: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// INFER: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// INFER: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// INFER: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// INFER: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+// INFER: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
+// INFER: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
+// INFER: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
+// INFER: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
+// INFER: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
+// INFER: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
+// INFER: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
+// INFER: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
+// INFER: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
+// INFER: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
+// INFER: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
+// INFER: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
+
+// UNKNOWN: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
 
 void main() {}
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
index c616f65bb9..0b576fc5e9 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
@@ -1,59 +1,81 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+
+// Before vulkan1.3, we should be trying to infer the image type because
+// we cannot necessarily use Unknown. However in VK1.3 and later, we can use
+// Unknown.
 
 // CHECK: OpCapability SampledBuffer
-// CHECK: OpCapability StorageImageExtendedFormats
+// INFER: OpCapability StorageImageExtendedFormats
 
-// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
+// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
+// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 RasterizerOrderedBuffer<int> introvbuf;
-// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 RasterizerOrderedBuffer<uint> uintrovbuf;
-// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 RasterizerOrderedBuffer<float> floatrovbuf;
 
-// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
+// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RasterizerOrderedBuffer<int2> int2rovbuf;
-// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
+// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RasterizerOrderedBuffer<uint2> uint2rovbuf;
-// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
+// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RasterizerOrderedBuffer<float2> float2rovbuf;
 
-// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
-// CHECK: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// INFER: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 RasterizerOrderedBuffer<int3> int3rovbuf;
 RasterizerOrderedBuffer<int4> int4rovbuf;
-// CHECK: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
-// CHECK: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// INFER: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// INFER: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RasterizerOrderedBuffer<uint3> uint3rovbuf;
 RasterizerOrderedBuffer<uint4> uint4rovbuf;
-// CHECK: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
-// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// INFER: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RasterizerOrderedBuffer<float3> float3rovbuf;
 RasterizerOrderedBuffer<float4> float4rovbuf;
 
-// CHECK: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// CHECK: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// CHECK: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// CHECK: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// CHECK: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// CHECK: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// CHECK: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// CHECK: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// CHECK: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// CHECK: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// CHECK: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// CHECK: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+// INFER: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// INFER: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// INFER: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// INFER: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// INFER: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// INFER: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// INFER: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// INFER: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// INFER: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// INFER: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// INFER: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// INFER: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+
+// UNKNOWN: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
 
 void main() {}
 
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
index 32dd76e6f1..21bff421a0 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
@@ -1,23 +1,28 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
 
 // CHECK: OpCapability Image1D
 
-// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
@@ -33,7 +38,8 @@ RasterizerOrderedTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RasterizerOrderedTexture3D   <float3> t4 ;
 
-// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
 RasterizerOrderedTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant
diff --git a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
index f901d44cfa..884957210a 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
@@ -1,24 +1,43 @@
-// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s
+// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-target-env=vulkan1.3 -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
 
 // CHECK: OpCapability Image1D
 
-// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
+// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
+// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
+// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
+// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
+// UNKNOWN: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// UNKNOWN: %type_3d_image_1 = OpTypeImage %float 3D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image_1 = OpTypePointer UniformConstant %type_3d_image_1
+// UNKNOWN: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// UNKNOWN: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// UNKNOWN: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// UNKNOWN: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
 RWTexture1D   <int>    t1 ;
@@ -33,7 +52,8 @@ RWTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RWTexture3D   <float3> t4 ;
 
-// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
 RWTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant

From f94396ddffa8562a00d64a1db58d3f73f33b655a Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:02:00 -0700
Subject: [PATCH 63/93] Long Vector Execution tests preliminary work to merge a
 minor refactor. (#7532)

This resolves issue #7531. This is a preliminary step to merging in some
of the Long Vector Execution tests currently sitting in the
staging-sm6.9 branch. There are no functional changes here, but given
that this is a refactor I do not want to add the [NFC] tag to the PR
title.

- Move functions used by existing exec tests and incoming long vector
tests to a common HlslExecTestUtils.h.
- Update naming to adhere to LLVM coding standards for newly created
files (even though the functions aren't new).
- Move a few other shared functions to files that make more sense than
ExecutionTest.cpp.
- Move the TableParameterHandler class to its own header/cpp files. No
naming updates, as nothing else was touched; these can follow in a
subsequent PR if that is preferred.
- Add the include guard preferred by the LLVM coding guidelines to
HlslTestUtils.h to mitigate redefinition issues exposed by this
refactor.
- Update the D3D shader model 'redefines' in ExecutionTest.cpp, as they
were also factored out into a common header. constexpr is required
because they are enum values.
- Add /bigobj to the CMake file, as I was hitting issues locally
without it.
---
 include/dxc/Test/HlslTestUtils.h              |    4 +
 tools/clang/unittests/HLSLExec/CMakeLists.txt |    3 +
 .../unittests/HLSLExec/ExecutionTest.cpp      | 1504 ++++-------------
 .../unittests/HLSLExec/HlslExecTestUtils.h    |  405 +++++
 .../clang/unittests/HLSLExec/ShaderOpTest.cpp |   73 +
 tools/clang/unittests/HLSLExec/ShaderOpTest.h |   32 +-
 .../HLSLExec/TableParameterHandler.cpp        |  376 +++++
 .../HLSLExec/TableParameterHandler.h          |  205 +++
 8 files changed, 1458 insertions(+), 1144 deletions(-)
 create mode 100644 tools/clang/unittests/HLSLExec/HlslExecTestUtils.h
 create mode 100644 tools/clang/unittests/HLSLExec/TableParameterHandler.cpp
 create mode 100644 tools/clang/unittests/HLSLExec/TableParameterHandler.h

diff --git a/include/dxc/Test/HlslTestUtils.h b/include/dxc/Test/HlslTestUtils.h
index 0e37ccdcff..44f3f6148a 100644
--- a/include/dxc/Test/HlslTestUtils.h
+++ b/include/dxc/Test/HlslTestUtils.h
@@ -10,6 +10,8 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 // *** THIS FILE CANNOT TAKE ANY LLVM DEPENDENCIES  *** //
+#ifndef HLSLTESTUTILS_H
+#define HLSLTESTUTILS_H
 
 #include <algorithm>
 #include <atomic>
@@ -735,3 +737,5 @@ inline UINT GetByteSizeForFormat(DXGI_FORMAT value) {
   }
 }
 #endif
+
+#endif // HLSLTESTUTILS_H
diff --git a/tools/clang/unittests/HLSLExec/CMakeLists.txt b/tools/clang/unittests/HLSLExec/CMakeLists.txt
index 3878fa3f34..df61aad854 100644
--- a/tools/clang/unittests/HLSLExec/CMakeLists.txt
+++ b/tools/clang/unittests/HLSLExec/CMakeLists.txt
@@ -3,9 +3,12 @@
 find_package(TAEF REQUIRED)
 find_package(D3D12 REQUIRED) # Used for ExecutionTest.cpp.
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
+
 add_clang_library(ExecHLSLTests SHARED
   ExecutionTest.cpp
   ShaderOpTest.cpp
+  TableParameterHandler.cpp
   ExecHLSLTests.rc
   )
 
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 6db27d7a41..c26b9a1b5b 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -60,6 +60,8 @@
 #include "ShaderOpTest.h"
 #include <libloaderapi.h>
 #include <DirectXPackedVector.h>
+#include "TableParameterHandler.h"
+#include "HlslExecTestUtils.h"
 // clang-format on
 
 #pragma comment(lib, "d3dcompiler.lib")
@@ -67,47 +69,6 @@
 #pragma comment(lib, "dxguid.lib")
 #pragma comment(lib, "version.lib")
 
-// A more recent Windows SDK than currently required is needed for these.
-typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
-    UINT NumFeatures, __in_ecount(NumFeatures) const IID *pIIDs,
-    __in_ecount_opt(NumFeatures) void *pConfigurationStructs,
-    __in_ecount_opt(NumFeatures) UINT *pConfigurationStructSizes);
-
-static const GUID D3D12ExperimentalShaderModelsID =
-    {/* 76f5573e-f13a-40f5-b297-81ce9e18933f */
-     0x76f5573e,
-     0xf13a,
-     0x40f5,
-     {0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f}};
-
-// Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically.
-typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID rclsid, REFIID riid,
-                                             void **ppvDebug);
-
-#ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__
-// Copied from AgilitySDK D3D12.h to programmatically enable when in developer
-// mode.
-#define __ID3D12SDKConfiguration_INTERFACE_DEFINED__
-
-EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = {
-    0xe9eb5314,
-    0x33aa,
-    0x42b2,
-    {0xa7, 0x18, 0xd7, 0x7f, 0x58, 0xb1, 0xf1, 0xc7}};
-EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = {
-    0x7cda6aca,
-    0xa03e,
-    0x49c8,
-    {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}};
-
-MIDL_INTERFACE("e9eb5314-33aa-42b2-a718-d77f58b1f1c7")
-ID3D12SDKConfiguration : public IUnknown {
-public:
-  virtual HRESULT STDMETHODCALLTYPE SetSDKVersion(UINT SDKVersion,
-                                                  LPCSTR SDKPath) = 0;
-};
-#endif /* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */
-
 using namespace DirectX;
 using namespace hlsl_test;
 
@@ -271,9 +232,6 @@ typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS4 {
 
 #endif
 
-// Virtual class to compute the expected result given a set of inputs
-struct TableParameter;
-
 class ExecutionTest {
 public:
   BEGIN_TEST_CLASS(ExecutionTest)
@@ -519,10 +477,10 @@ class ExecutionTest {
         return false;
       // Do not: FreeLibrary(hRuntime);
       // If we actually free the library, it defeats the purpose of
-      // EnableAgilitySDK and EnableExperimentalMode.
+      // enableAgilitySDK and enableExperimentalMode.
 
       HRESULT hr;
-      hr = EnableAgilitySDK(hRuntime);
+      hr = enableAgilitySDK(hRuntime);
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", hr);
       } else if (hr == S_FALSE) {
@@ -531,7 +489,7 @@ class ExecutionTest {
         LogCommentFmt(L"Agility SDK enabled.");
       }
 
-      hr = EnableExperimentalMode(hRuntime);
+      hr = enableExperimentalMode(hRuntime);
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.",
                       hr);
@@ -541,7 +499,7 @@ class ExecutionTest {
         LogCommentFmt(L"Experimental mode enabled.");
       }
 
-      hr = EnableDebugLayer();
+      hr = enableDebugLayer();
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", hr);
       } else if (hr == S_FALSE) {
@@ -602,41 +560,31 @@ class ExecutionTest {
   // Do not remove the following line - it is used by TranslateExecutionTest.py
   // MARKER: ExecutionTest/DxilConf Shared Implementation Start
 
-  // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we
-  // only require the Windows 10 SDK.
-  typedef enum D3D_SHADER_MODEL {
-    D3D_SHADER_MODEL_5_1 = 0x51,
-    D3D_SHADER_MODEL_6_0 = 0x60,
-    D3D_SHADER_MODEL_6_1 = 0x61,
-    D3D_SHADER_MODEL_6_2 = 0x62,
-    D3D_SHADER_MODEL_6_3 = 0x63,
-    D3D_SHADER_MODEL_6_4 = 0x64,
-    D3D_SHADER_MODEL_6_5 = 0x65,
-    D3D_SHADER_MODEL_6_6 = 0x66,
-    D3D_SHADER_MODEL_6_7 = 0x67,
-    D3D_SHADER_MODEL_6_8 = 0x68,
-    D3D_SHADER_MODEL_6_9 = 0x69,
-  } D3D_SHADER_MODEL;
-
-  static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_9;
-
-  bool UseDxbc() {
-#ifdef _HLK_CONF
-    return false;
-#else
-    return GetTestParamBool(L"DXBC");
-#endif
-  }
-
-  bool UseWarpByDefault() {
-#ifdef _HLK_CONF
-    return false;
-#else
-    return true;
-#endif
-  }
-
-  bool UseDebugIfaces() { return true; }
+  // We define D3D_SHADER_MODEL enum values as we don't generally have access to
+  // the latest D3D headers when adding tests for a new SM being added.
+  using D3D_SHADER_MODEL = ExecTestUtils::D3D_SHADER_MODEL;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_0 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_0;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_1 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_1;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_2 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_2;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_3 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_3;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_4 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_4;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_5 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_5;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_6 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_6;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_7 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_7;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_8 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_8;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_9 =
+      ExecTestUtils::D3D_SHADER_MODEL_6_9;
+  static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_HIGHEST_SHADER_MODEL =
+      ExecTestUtils::D3D_HIGHEST_SHADER_MODEL;
 
   bool SaveImages() { return GetTestParamBool(L"SaveImages"); }
 
@@ -766,7 +714,7 @@ class ExecutionTest {
     CComPtr<ID3DBlob> pComputeShader;
 
     // Load and compile shaders.
-    if (UseDxbc()) {
+    if (useDxbc()) {
 #ifndef _HLK_CONF
       DXBCFromText(pShader, L"main", pTargetProfile, &pComputeShader);
 #endif
@@ -784,112 +732,6 @@ class ExecutionTest {
         &computePsoDesc, IID_PPV_ARGS(ppComputeState)));
   }
 
-  bool CreateDevice(ID3D12Device **ppDevice,
-                    D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0,
-                    bool skipUnsupported = true) {
-    if (testModel > HIGHEST_SHADER_MODEL) {
-      UINT minor = (UINT)testModel & 0x0f;
-      LogCommentFmt(L"Installed SDK does not support "
-                    L"shader model 6.%1u",
-                    minor);
-
-      if (skipUnsupported) {
-        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-      }
-
-      return false;
-    }
-    CComPtr<IDXGIFactory4> factory;
-    CComPtr<ID3D12Device> pDevice;
-
-    *ppDevice = nullptr;
-
-    VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
-    if (GetTestParamUseWARP(UseWarpByDefault())) {
-      CComPtr<IDXGIAdapter> warpAdapter;
-      VERIFY_SUCCEEDED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)));
-      HRESULT createHR = D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0,
-                                           IID_PPV_ARGS(&pDevice));
-      if (FAILED(createHR)) {
-        LogCommentFmt(L"The available version of WARP does not support d3d12.");
-
-        if (skipUnsupported) {
-          WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-        }
-
-        return false;
-      }
-
-      if (GetModuleHandleW(L"d3d10warp.dll") != NULL) {
-        WCHAR szFullModuleFilePath[MAX_PATH] = L"";
-        GetModuleFileNameW(GetModuleHandleW(L"d3d10warp.dll"),
-                           szFullModuleFilePath, sizeof(szFullModuleFilePath));
-        WEX::Logging::Log::Comment(WEX::Common::String().Format(
-            L"WARP driver loaded from: %S", szFullModuleFilePath));
-      }
-
-    } else {
-      CComPtr<IDXGIAdapter1> hardwareAdapter;
-      WEX::Common::String AdapterValue;
-      HRESULT hr = WEX::TestExecution::RuntimeParameters::TryGetValue(
-          L"Adapter", AdapterValue);
-      if (SUCCEEDED(hr)) {
-        st::GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter);
-      } else {
-        WEX::Logging::Log::Comment(
-            L"Using default hardware adapter with D3D12 support.");
-      }
-
-      VERIFY_SUCCEEDED(D3D12CreateDevice(
-          hardwareAdapter, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&pDevice)));
-    }
-    // retrieve adapter information
-    LUID adapterID = pDevice->GetAdapterLuid();
-    CComPtr<IDXGIAdapter> adapter;
-    factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter));
-    DXGI_ADAPTER_DESC AdapterDesc;
-    VERIFY_SUCCEEDED(adapter->GetDesc(&AdapterDesc));
-    LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description);
-
-    if (pDevice == nullptr)
-      return false;
-
-    if (!UseDxbc()) {
-      // Check for DXIL support.
-      typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
-        D3D_SHADER_MODEL HighestShaderModel;
-      } D3D12_FEATURE_DATA_SHADER_MODEL;
-      const UINT D3D12_FEATURE_SHADER_MODEL = 7;
-      D3D12_FEATURE_DATA_SHADER_MODEL SMData;
-      SMData.HighestShaderModel = testModel;
-      if (FAILED(pDevice->CheckFeatureSupport(
-              (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData,
-              sizeof(SMData))) ||
-          SMData.HighestShaderModel < testModel) {
-        UINT minor = (UINT)testModel & 0x0f;
-        LogCommentFmt(L"The selected device does not support "
-                      L"shader model 6.%1u",
-                      minor);
-
-        if (skipUnsupported) {
-          WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-        }
-
-        return false;
-      }
-    }
-
-    if (UseDebugIfaces()) {
-      CComPtr<ID3D12InfoQueue> pInfoQueue;
-      if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
-        pInfoQueue->SetMuteDebugOutput(FALSE);
-      }
-    }
-
-    *ppDevice = pDevice.Detach();
-    return true;
-  }
-
   void CreateGraphicsCommandQueue(ID3D12Device *pDevice,
                                   ID3D12CommandQueue **ppCommandQueue) {
     D3D12_COMMAND_QUEUE_DESC queueDesc = {};
@@ -919,7 +761,7 @@ class ExecutionTest {
     CComPtr<ID3DBlob> vertexShader;
     CComPtr<ID3DBlob> pixelShader;
 
-    if (UseDxbc()) {
+    if (useDxbc()) {
 #ifndef _HLK_CONF
       DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
       DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
@@ -1642,7 +1484,7 @@ class ExecutionTest {
     // The debug layer does net yet validate DXIL programs that require
     // rewriting, but basic logging should work properly.
     HRESULT hr = S_FALSE;
-    if (UseDebugIfaces()) {
+    if (useDebugIfaces()) {
       CComPtr<ID3D12Debug> debugController;
       hr = D3D12GetDebugInterface(IID_PPV_ARGS(&debugController));
       if (SUCCEEDED(hr)) {
@@ -1830,20 +1672,6 @@ class ExecutionTest {
     }
   }
 
-  void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) {
-    VERIFY_SUCCEEDED(m_support.Initialize());
-    CComPtr<IDxcLibrary> pLibrary;
-    CComPtr<IDxcBlobEncoding> pBlob;
-    CComPtr<IStream> pStream;
-    std::wstring path = GetPathToHlslDataFile(relativePath, HLSLDATAFILEPARAM,
-                                              DEFAULT_EXEC_TEST_DIR);
-    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
-    VERIFY_SUCCEEDED(
-        pLibrary->CreateBlobFromFile(path.c_str(), nullptr, &pBlob));
-    VERIFY_SUCCEEDED(pLibrary->CreateStreamFromBlobReadOnly(pBlob, &pStream));
-    *ppStream = pStream.Detach();
-  }
-
   void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList,
                                ID3D12DescriptorHeap *pRtvHeap,
                                UINT rtvDescriptorSize, UINT instanceCount,
@@ -2348,15 +2176,15 @@ TEST_F(ExecutionTest, LifetimeIntrinsicTest) {
   static const int DispatchGroupCount = 1;
 
   CComPtr<ID3D12Device> pDevice;
-  bool bSM_6_6_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6, false);
+  bool bSM_6_6_Supported = createDevice(&pDevice, D3D_SHADER_MODEL_6_6, false);
   bool bSM_6_3_Supported = bSM_6_6_Supported;
   if (!bSM_6_6_Supported) {
     // Try 6.3 for downlevel DXR case
-    bSM_6_3_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_3, false);
+    bSM_6_3_Supported = createDevice(&pDevice, D3D_SHADER_MODEL_6_3, false);
   }
   if (!bSM_6_3_Supported) {
     // Otherwise, 6.0 better be supported for compute case
-    VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0, false));
+    VERIFY_IS_TRUE(createDevice(&pDevice, D3D_SHADER_MODEL_6_0, false));
   }
   bool bDXRSupported =
       bSM_6_3_Supported && DoesDeviceSupportRayTracing(pDevice);
@@ -2465,7 +2293,7 @@ TEST_F(ExecutionTest, BasicComputeTest) {
   static const int DispatchGroupCount = 1;
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   std::vector<uint32_t> values;
@@ -2524,7 +2352,7 @@ TEST_F(ExecutionTest, BasicTriangleTest) {
       "  return 1; //input.color;\r\n"
       "};\r\n";
 
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   struct BasicTestChecker {
@@ -2668,7 +2496,7 @@ TEST_F(ExecutionTest, Int64Test) {
   static const int DispatchGroupCount = 1;
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   if (!DoesDeviceSupportInt64(pDevice)) {
@@ -2693,7 +2521,7 @@ TEST_F(ExecutionTest, SignTest) {
                                 "}";
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   const uint32_t neg1 = (uint32_t)-1;
@@ -2714,7 +2542,7 @@ TEST_F(ExecutionTest, SignTest) {
 TEST_F(ExecutionTest, WaveIntrinsicsDDITest) {
 #ifndef _HLK_CONF
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
   D3D12_FEATURE_DATA_D3D12_OPTIONS1 O;
   if (FAILED(pDevice->CheckFeatureSupport(
@@ -2814,7 +2642,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsTest) {
   static const int DispatchGroupCount = 1;
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   if (!DoesDeviceSupportWaveOps(pDevice)) {
@@ -2841,7 +2669,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsTest) {
   CComPtr<ID3D12DescriptorHeap> pUavHeap;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   FenceObj FO;
-  bool dxbc = UseDxbc();
+  bool dxbc = useDxbc();
 
   const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData);
   CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue",
@@ -3172,7 +3000,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
   CComPtr<ID3D12Resource> pVertexBuffer;
   D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
 
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
@@ -3229,7 +3057,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
 
   CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView);
 
-  bool dxbc = UseDxbc();
+  bool dxbc = useDxbc();
 
   // Set up UAV resource.
   std::vector<PerPixelData> values;
@@ -3491,12 +3319,6 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
   }
 }
 
-struct ShaderOpTestResult {
-  st::ShaderOp *ShaderOp;
-  std::shared_ptr<st::ShaderOpSet> ShaderOpSet;
-  std::shared_ptr<st::ShaderOpTest> Test;
-};
-
 struct SPrimitives {
   float f_float;
   float f_float2;
@@ -3504,87 +3326,19 @@ struct SPrimitives {
   float f_float2_o;
 };
 
-std::shared_ptr<ShaderOpTestResult>
-RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
-                          LPCSTR pName,
-                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
-                          st::ShaderOpTest::TShaderCallbackFn pShaderCallback,
-                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
-  st::ShaderOp *pShaderOp;
-  if (pName == nullptr) {
-    if (ShaderOpSet->ShaderOps.size() != 1) {
-      VERIFY_FAIL(L"Expected a single shader operation.");
-    }
-    pShaderOp = ShaderOpSet->ShaderOps[0].get();
-  } else {
-    pShaderOp = ShaderOpSet->GetShaderOp(pName);
-  }
-  if (pShaderOp == nullptr) {
-    std::string msg = "Unable to find shader op ";
-    msg += pName;
-    msg += "; available ops";
-    const char sep = ':';
-    for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
-      msg += sep;
-      msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
-    }
-    CA2W msgWide(msg.c_str());
-    VERIFY_FAIL(msgWide.m_psz);
-  }
-
-  // This won't actually be used since we're supplying the device,
-  // but let's make it consistent.
-  pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
-
-  std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
-  test->SetDxcSupport(&support);
-  test->SetInitCallback(pInitCallback);
-  test->SetShaderCallback(pShaderCallback);
-  test->SetDevice(pDevice);
-  test->RunShaderOp(pShaderOp);
-
-  std::shared_ptr<ShaderOpTestResult> result =
-      std::make_shared<ShaderOpTestResult>();
-  result->ShaderOpSet = ShaderOpSet;
-  result->Test = test;
-  result->ShaderOp = pShaderOp;
-  return result;
-}
-
-std::shared_ptr<ShaderOpTestResult>
-RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
-                          LPCSTR pName,
-                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
-                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
-  return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback,
-                                   nullptr, ShaderOpSet);
-}
-
-std::shared_ptr<ShaderOpTestResult>
-RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
-                IStream *pStream, LPCSTR pName,
-                st::ShaderOpTest::TInitCallbackFn pInitCallback) {
-  DXASSERT_NOMSG(pStream != nullptr);
-  std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
-      std::make_shared<st::ShaderOpSet>();
-  st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
-  return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback,
-                                   ShaderOpSet);
-}
-
 TEST_F(ExecutionTest, OutOfBoundsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
-  std::shared_ptr<ShaderOpTestResult> test =
-      RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
+  std::shared_ptr<st::ShaderOpTestResult> test =
+      st::RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
   MappedData data;
   // Read back to CPU and examine contents - should get pure red.
   {
@@ -3601,15 +3355,15 @@ TEST_F(ExecutionTest, SaturateTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
-  std::shared_ptr<ShaderOpTestResult> test =
-      RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr);
+  std::shared_ptr<st::ShaderOpTestResult> test =
+      st::RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr);
   MappedData data;
   test->Test->GetReadBackData("U0", &data);
   const float *pValues = (float *)data.data();
@@ -3636,11 +3390,11 @@ void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName,
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, testModel))
+  if (!createDevice(&pDevice, testModel))
     return;
 
   // As this is used, 6.2 requirement always comes with requiring native 16-bit
@@ -3653,8 +3407,8 @@ void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName,
     return;
   }
 
-  std::shared_ptr<ShaderOpTestResult> test =
-      RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr);
+  std::shared_ptr<st::ShaderOpTestResult> test =
+      st::RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr);
   MappedData data;
   D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT width = (UINT)D.Width;
@@ -3786,14 +3540,14 @@ TEST_F(ExecutionTest, PartialDerivTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
-  std::shared_ptr<ShaderOpTestResult> test =
-      RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr);
+  std::shared_ptr<st::ShaderOpTestResult> test =
+      st::RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr);
   MappedData data;
   D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT width = (UINT)D.Width;
@@ -3894,10 +3648,10 @@ TEST_F(ExecutionTest, DerivativesTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
@@ -3977,10 +3731,10 @@ TEST_F(ExecutionTest, QuadReadTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   if (!DoesDeviceSupportWaveOps(pDevice)) {
@@ -4033,8 +3787,9 @@ TEST_F(ExecutionTest, QuadReadTest) {
 
     // Test Compute Shader
     pShaderOp->CS = CS;
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "QuadRead", nullptr, ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr,
+                                      ShaderOpSet);
     MappedData data;
 
     test->Test->GetReadBackData("U0", &data);
@@ -4055,8 +3810,8 @@ TEST_F(ExecutionTest, QuadReadTest) {
 
       // Disable CS so mesh goes forward
       pShaderOp->CS = nullptr;
-      test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr,
-                                       ShaderOpSet);
+      test = st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead",
+                                           nullptr, ShaderOpSet);
       test->Test->GetReadBackData("U1", &data);
       pPixels = (UINT *)data.data();
       // Test first, second and center quads
@@ -4124,10 +3879,10 @@ TEST_F(ExecutionTest, ComputeSampleTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
@@ -4175,7 +3930,7 @@ TEST_F(ExecutionTest, ComputeSampleTest) {
   }
 
   // Test 1D compute shader
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
   MappedData data;
 
@@ -4190,8 +3945,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) {
   pShaderOp->CS = CS2;
 
   test.reset();
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
-                                   SampleInitFn, ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
+                                       SampleInitFn, ShaderOpSet);
 
   test->Test->GetReadBackData("U0", &data);
   pPixels = (UINT *)data.data();
@@ -4203,8 +3958,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) {
   if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
     // Disable CS so mesh goes forward
     pShaderOp->CS = nullptr;
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
-                                     SampleInitFn, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
+                                         SampleInitFn, ShaderOpSet);
     test->Test->GetReadBackData("U1", &data);
     pPixels = (UINT *)data.data();
 
@@ -4221,8 +3976,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) {
 
     pShaderOp->AS = AS2;
     pShaderOp->MS = MS2;
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
-                                     SampleInitFn, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample",
+                                         SampleInitFn, ShaderOpSet);
     test->Test->GetReadBackData("U1", &data);
     pPixels = (UINT *)data.data();
 
@@ -4251,7 +4006,7 @@ TEST_F(ExecutionTest, ATOWriteMSAATest) {
 #else
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7;
 #endif
-  if (!CreateDevice(&pDevice, sm))
+  if (!createDevice(&pDevice, sm))
     return;
 
 #ifndef WRITEMSAA_FALLBACK
@@ -4517,7 +4272,7 @@ TEST_F(ExecutionTest, ATOProgOffset) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -4550,7 +4305,7 @@ TEST_F(ExecutionTest, ATOProgOffset) {
     D3D_SHADER_MODEL sm = TestShaderModels[i];
 
     CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice, sm, /*skipUnsupported*/ false)) {
+    if (!createDevice(&pDevice, sm, /*skipUnsupported*/ false)) {
       LogCommentFmt(L"Device does not support shader model 6.%1u",
                     ((UINT)sm & 0x0f));
       break;
@@ -4603,8 +4358,9 @@ TEST_F(ExecutionTest, ATOProgOffset) {
     }
 
     // Test compute shader
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "ProgOffset", SampleInitFn, ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset",
+                                      SampleInitFn, ShaderOpSet);
     MappedData data;
 
     test->Test->GetReadBackData("U0", &data);
@@ -4614,8 +4370,8 @@ TEST_F(ExecutionTest, ATOProgOffset) {
     pShaderOp->CS = nullptr;
 
     if (DoesDeviceSupportMeshShaders(pDevice)) {
-      test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset",
-                                       SampleInitFn, ShaderOpSet);
+      test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset",
+                                           SampleInitFn, ShaderOpSet);
 
       // PS
       test->Test->GetReadBackData("U0", &data);
@@ -4632,8 +4388,8 @@ TEST_F(ExecutionTest, ATOProgOffset) {
 
     // Disable MS so PS goes forward
     pShaderOp->MS = nullptr;
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset",
-                                     SampleInitFn, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset",
+                                         SampleInitFn, ShaderOpSet);
 
     test->Test->GetReadBackData("U0", &data);
     VerifyProgOffsetResults((UINT *)data.data(), true);
@@ -4653,10 +4409,10 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_7))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_7))
     return;
 
   if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
@@ -4701,7 +4457,7 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) {
   };
 
   // Test compute shader
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet);
   MappedData data;
 
@@ -4718,8 +4474,8 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     // Disable CS so mesh goes forward
     pShaderOp->CS = nullptr;
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel",
-                                     SampleInitFn, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel",
+                                         SampleInitFn, ShaderOpSet);
 
     test->Test->GetReadBackData("U0", &data);
     pPixels = (UINT *)data.data();
@@ -5298,7 +5054,7 @@ TEST_F(ExecutionTest, ATORawGather) {
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7;
 #endif
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, sm))
+  if (!createDevice(&pDevice, sm))
     return;
 
 #ifndef RAWGATHER_FALLBACK
@@ -5528,7 +5284,7 @@ void ExecutionTest::RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel) {
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, shaderModel)) {
+  if (!createDevice(&pDevice, shaderModel)) {
     return;
   }
 
@@ -5628,9 +5384,9 @@ void ExecutionTest::RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice,
   };
 
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryFPOp",
       // this callback is called when the test is creating the resource to run
       // the test
@@ -5958,178 +5714,6 @@ struct SPackUnpackOpOutUnpacked {
   std::array<uint16_t, 4> outputClampedUint16;
   std::array<int16_t, 4> outputClampedInt16;
 };
-
-// Parameter representation for taef data-driven tests
-struct TableParameter {
-  LPCWSTR m_name;
-  enum TableParameterType {
-    INT8,
-    INT16,
-    INT32,
-    UINT,
-    FLOAT,
-    HALF,
-    DOUBLE,
-    STRING,
-    BOOL,
-    INT8_TABLE,
-    INT16_TABLE,
-    INT32_TABLE,
-    FLOAT_TABLE,
-    HALF_TABLE,
-    DOUBLE_TABLE,
-    STRING_TABLE,
-    UINT8_TABLE,
-    UINT16_TABLE,
-    UINT32_TABLE,
-    BOOL_TABLE
-  };
-  TableParameter(LPCWSTR name, TableParameterType type, bool required)
-      : m_name(name), m_type(type), m_required(required) {}
-  TableParameterType m_type;
-  bool m_required; // required parameter
-  int8_t m_int8;
-  int16_t m_int16;
-  int m_int32;
-  unsigned int m_uint;
-  float m_float;
-  uint16_t m_half; // no such thing as half type in c++. Use int16 instead
-  double m_double;
-  bool m_bool;
-  WEX::Common::String m_str;
-  std::vector<int8_t> m_int8Table;
-  std::vector<int16_t> m_int16Table;
-  std::vector<int> m_int32Table;
-  std::vector<uint8_t> m_uint8Table;
-  std::vector<uint16_t> m_uint16Table;
-  std::vector<unsigned int> m_uint32Table;
-  std::vector<float> m_floatTable;
-  std::vector<uint16_t> m_halfTable; // no such thing as half type in c++
-  std::vector<double> m_doubleTable;
-  std::vector<bool> m_boolTable;
-  std::vector<WEX::Common::String> m_StringTable;
-};
-
-class TableParameterHandler {
-private:
-  HRESULT ParseTableRow();
-
-public:
-  TableParameter *m_table;
-  size_t m_tableSize;
-  TableParameterHandler(TableParameter *pTable, size_t size)
-      : m_table(pTable), m_tableSize(size) {
-    clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow());
-  }
-
-  TableParameter *GetTableParamByName(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &m_table[i];
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  void clearTableParameter() {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      m_table[i].m_int32 = 0;
-      m_table[i].m_uint = 0;
-      m_table[i].m_double = 0;
-      m_table[i].m_bool = false;
-      m_table[i].m_str = WEX::Common::String();
-    }
-  }
-
-  template <class T1> std::vector<T1> *GetDataArray(LPCWSTR name) {
-    return nullptr;
-  }
-
-  template <> std::vector<int> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_int32Table);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<int8_t> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_int8Table);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<int16_t> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_int16Table);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<unsigned int> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_uint32Table);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<float> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_floatTable);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  // TODO: uin16_t may be used to represent two different types when we
-  // introduce uint16
-  template <> std::vector<uint16_t> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_halfTable);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<double> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_doubleTable);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <> std::vector<bool> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_boolTable);
-      }
-    }
-    DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-};
-
 static TableParameter UnaryFPOpParameters[] = {
     {L"ShaderOp.Target", TableParameter::STRING, true},
     {L"ShaderOp.Text", TableParameter::STRING, true},
@@ -6460,381 +6044,6 @@ static TableParameter PackUnpackOpParameters[] = {
     {L"Validation.Input", TableParameter::UINT32_TABLE, true},
 };
 
-static bool IsHexString(PCWSTR str, uint16_t *value) {
-  std::wstring wString(str);
-  wString.erase(std::remove(wString.begin(), wString.end(), L' '),
-                wString.end());
-  LPCWSTR wstr = wString.c_str();
-  if (wcsncmp(wstr, L"0x", 2) == 0 || wcsncmp(wstr, L"0b", 2) == 0) {
-    *value = (uint16_t)wcstol(wstr, NULL, 0);
-    return true;
-  }
-  return false;
-}
-
-static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
-  std::wstring wString(str);
-  wString.erase(std::remove(wString.begin(), wString.end(), L' '),
-                wString.end());
-  wString.erase(std::remove(wString.begin(), wString.end(), L'\n'),
-                wString.end());
-  PCWSTR wstr = wString.data();
-  if (_wcsicmp(wstr, L"NaN") == 0) {
-    value = NAN;
-  } else if (_wcsicmp(wstr, L"-inf") == 0) {
-    value = -(INFINITY);
-  } else if (_wcsicmp(wstr, L"inf") == 0) {
-    value = INFINITY;
-  } else if (_wcsicmp(wstr, L"-denorm") == 0) {
-    value = -(FLT_MIN / 2);
-  } else if (_wcsicmp(wstr, L"denorm") == 0) {
-    value = FLT_MIN / 2;
-  } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
-             _wcsicmp(wstr, L"-0") == 0) {
-    value = -0.0f;
-  } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
-             _wcsicmp(wstr, L"0") == 0) {
-    value = 0.0f;
-  } else if (_wcsnicmp(wstr, L"0x", 2) ==
-             0) { // For hex values, take values literally
-    unsigned temp_i = std::stoul(wstr, nullptr, 16);
-    value = (float &)temp_i;
-  } else {
-    // evaluate the expression of wstring
-    double val = _wtof(wstr);
-    if (val == 0) {
-      LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
-      return E_FAIL;
-    }
-    value = (float)val;
-  }
-  return S_OK;
-}
-
-static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
-  std::wstring wString(str);
-  wString.erase(std::remove(wString.begin(), wString.end(), L' '),
-                wString.end());
-  PCWSTR wstr = wString.data();
-  // evaluate the expression of string
-  if (_wcsicmp(wstr, L"0") == 0 || _wcsicmp(wstr, L"0x00000000") == 0) {
-    value = 0;
-    return S_OK;
-  }
-  wchar_t *end;
-  unsigned int val = std::wcstoul(wstr, &end, 0);
-  if (val == 0) {
-    LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
-    return E_FAIL;
-  }
-  value = val;
-  return S_OK;
-}
-
-static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
-  std::wstring wstr(str);
-  size_t curPosition = 0;
-  // parse a string of dot product separated by commas
-  for (size_t i = 0; i < count; ++i) {
-    size_t nextPosition = wstr.find(L",", curPosition);
-    if (FAILED(ParseDataToFloat(
-            wstr.substr(curPosition, nextPosition - curPosition).data(),
-            *(ptr + i)))) {
-      return E_FAIL;
-    }
-    curPosition = nextPosition + 1;
-  }
-  return S_OK;
-}
-
-static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
-  std::wstring wstr(str);
-  size_t curPosition = 0;
-  // parse a string of dot product separated by commas
-  for (size_t i = 0; i < count; ++i) {
-    size_t nextPosition = wstr.find(L",", curPosition);
-    float floatValue;
-    if (FAILED(ParseDataToFloat(
-            wstr.substr(curPosition, nextPosition - curPosition).data(),
-            floatValue))) {
-      return E_FAIL;
-    }
-    *(ptr + i) = ConvertFloat32ToFloat16(floatValue);
-    curPosition = nextPosition + 1;
-  }
-  return S_OK;
-}
-
-static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr,
-                                     size_t count) {
-  std::wstring wstr(str);
-  size_t curPosition = 0;
-  // parse a string of dot product separated by commas
-  for (size_t i = 0; i < count; ++i) {
-    size_t nextPosition = wstr.find(L",", curPosition);
-    if (FAILED(ParseDataToUint(
-            wstr.substr(curPosition, nextPosition - curPosition).data(),
-            *(ptr + i)))) {
-      return E_FAIL;
-    }
-    curPosition = nextPosition + 1;
-  }
-  return S_OK;
-}
-
-HRESULT TableParameterHandler::ParseTableRow() {
-  TableParameter *table = m_table;
-  for (unsigned int i = 0; i < m_tableSize; ++i) {
-    switch (table[i].m_type) {
-    case TableParameter::INT8:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_int32)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int16
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_int8 = (int8_t)(table[i].m_int32);
-      break;
-    case TableParameter::INT16:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_int32)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int16
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_int16 = (short)(table[i].m_int32);
-      break;
-    case TableParameter::INT32:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_int32)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::UINT:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_uint)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::DOUBLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-              table[i].m_name, table[i].m_double)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::STRING:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_str)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::BOOL:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_str)) &&
-          table[i].m_bool) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::INT8_TABLE: {
-      WEX::TestExecution::TestDataArray<int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      // TryGetValue does not suppport reading from int8
-      table[i].m_int8Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_int8Table[j] = (int8_t)tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::INT16_TABLE: {
-      WEX::TestExecution::TestDataArray<int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      // TryGetValue does not suppport reading from int8
-      table[i].m_int16Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_int16Table[j] = (int16_t)tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::INT32_TABLE: {
-      WEX::TestExecution::TestDataArray<int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_int32Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_int32Table[j] = tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::UINT8_TABLE: {
-      WEX::TestExecution::TestDataArray<int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      // TryGetValue does not suppport reading from int8
-      table[i].m_int8Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_int8Table[j] = (uint8_t)tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::UINT16_TABLE: {
-      WEX::TestExecution::TestDataArray<int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      // TryGetValue does not suppport reading from int8
-      table[i].m_uint16Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_uint16Table[j] = (uint16_t)tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::UINT32_TABLE: {
-      WEX::TestExecution::TestDataArray<unsigned int> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_uint32Table.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_uint32Table[j] = tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::FLOAT_TABLE: {
-      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_floatTable.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        ParseDataToFloat(tempTable[j], table[i].m_floatTable[j]);
-      }
-      break;
-    }
-    case TableParameter::HALF_TABLE: {
-      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_halfTable.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        uint16_t value = 0;
-        if (IsHexString(tempTable[j], &value)) {
-          table[i].m_halfTable[j] = value;
-        } else {
-          float val;
-          ParseDataToFloat(tempTable[j], val);
-          if (isdenorm(val))
-            table[i].m_halfTable[j] =
-                signbit(val) ? Float16NegDenorm : Float16PosDenorm;
-          else
-            table[i].m_halfTable[j] = ConvertFloat32ToFloat16(val);
-        }
-      }
-      break;
-    }
-    case TableParameter::DOUBLE_TABLE: {
-      WEX::TestExecution::TestDataArray<double> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_doubleTable.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_doubleTable[j] = tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::BOOL_TABLE: {
-      WEX::TestExecution::TestDataArray<bool> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_boolTable.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_boolTable[j] = tempTable[j];
-      }
-      break;
-    }
-    case TableParameter::STRING_TABLE: {
-      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           tempTable)) &&
-          table[i].m_required) {
-        // TryGetValue does not suppport reading from int8
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      table[i].m_StringTable.resize(tempTable.GetSize());
-      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
-        table[i].m_StringTable[j] = tempTable[j];
-      }
-      break;
-    }
-    default:
-      DXASSERT_NOMSG("Invalid Parameter Type");
-    }
-    if (errno == ERANGE) {
-      LogErrorFmt(L"got out of range value for table %s", table[i].m_name);
-      return E_FAIL;
-    }
-  }
-  return S_OK;
-}
-
 static bool CompareOutputWithExpectedValueInt(int output, int ref,
                                               int tolerance) {
   return ((output - ref) <= tolerance) && ((ref - output) <= tolerance);
@@ -6972,10 +6181,10 @@ TEST_F(ExecutionTest, UnaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -6997,7 +6206,7 @@ TEST_F(ExecutionTest, UnaryFloatOpTest) {
 
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7035,10 +6244,10 @@ TEST_F(ExecutionTest, BinaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7065,7 +6274,7 @@ TEST_F(ExecutionTest, BinaryFloatOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7125,10 +6334,10 @@ TEST_F(ExecutionTest, TertiaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7155,7 +6364,7 @@ TEST_F(ExecutionTest, TertiaryFloatOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7198,10 +6407,10 @@ TEST_F(ExecutionTest, UnaryHalfOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -7232,7 +6441,7 @@ TEST_F(ExecutionTest, UnaryHalfOpTest) {
 
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7273,10 +6482,10 @@ TEST_F(ExecutionTest, BinaryHalfOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -7312,7 +6521,7 @@ TEST_F(ExecutionTest, BinaryHalfOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7384,10 +6593,10 @@ TEST_F(ExecutionTest, TertiaryHalfOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -7422,7 +6631,7 @@ TEST_F(ExecutionTest, TertiaryHalfOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7470,10 +6679,10 @@ TEST_F(ExecutionTest, UnaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7492,7 +6701,7 @@ TEST_F(ExecutionTest, UnaryIntOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7530,10 +6739,10 @@ TEST_F(ExecutionTest, UnaryUintOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7552,7 +6761,7 @@ TEST_F(ExecutionTest, UnaryUintOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7590,10 +6799,10 @@ TEST_F(ExecutionTest, BinaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7617,7 +6826,7 @@ TEST_F(ExecutionTest, BinaryIntOpTest) {
 
   size_t numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7680,10 +6889,10 @@ TEST_F(ExecutionTest, TertiaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7705,7 +6914,7 @@ TEST_F(ExecutionTest, TertiaryIntOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7750,10 +6959,10 @@ TEST_F(ExecutionTest, BinaryUintOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7775,7 +6984,7 @@ TEST_F(ExecutionTest, BinaryUintOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
   int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7842,10 +7051,10 @@ TEST_F(ExecutionTest, TertiaryUintOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   // Read data from the table
@@ -7867,7 +7076,7 @@ TEST_F(ExecutionTest, TertiaryUintOpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7916,10 +7125,10 @@ TEST_F(ExecutionTest, UnaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -7946,7 +7155,7 @@ TEST_F(ExecutionTest, UnaryInt16OpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -7984,10 +7193,10 @@ TEST_F(ExecutionTest, UnaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -8014,7 +7223,7 @@ TEST_F(ExecutionTest, UnaryUint16OpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -8053,10 +7262,10 @@ TEST_F(ExecutionTest, BinaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -8089,7 +7298,7 @@ TEST_F(ExecutionTest, BinaryInt16OpTest) {
 
   size_t numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -8151,10 +7360,10 @@ TEST_F(ExecutionTest, TertiaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -8185,7 +7394,7 @@ TEST_F(ExecutionTest, TertiaryInt16OpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryIntOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -8228,10 +7437,10 @@ TEST_F(ExecutionTest, BinaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -8262,7 +7471,7 @@ TEST_F(ExecutionTest, BinaryUint16OpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
   int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -8326,10 +7535,10 @@ TEST_F(ExecutionTest, TertiaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -8361,7 +7570,7 @@ TEST_F(ExecutionTest, TertiaryUint16OpTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryUintOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -8916,10 +8125,10 @@ TEST_F(ExecutionTest, DotTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
 
@@ -8946,7 +8155,7 @@ TEST_F(ExecutionTest, DotTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "DotOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9000,10 +8209,10 @@ TEST_F(ExecutionTest, Dot2AddHalfTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
     return;
   }
 
@@ -9036,7 +8245,7 @@ TEST_F(ExecutionTest, Dot2AddHalfTest) {
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = validation_input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "Dot2AddHalfOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9088,10 +8297,10 @@ TEST_F(ExecutionTest, Dot4AddI8PackedTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
     return;
   }
 
@@ -9112,7 +8321,7 @@ TEST_F(ExecutionTest, Dot4AddI8PackedTest) {
 
   size_t count = validation_input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "Dot4AddI8PackedOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9151,10 +8360,10 @@ TEST_F(ExecutionTest, Dot4AddU8PackedTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
     return;
   }
 
@@ -9175,7 +8384,7 @@ TEST_F(ExecutionTest, Dot4AddU8PackedTest) {
 
   size_t count = validation_input1->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "Dot4AddU8PackedOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9214,10 +8423,10 @@ TEST_F(ExecutionTest, Msad4Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter);
@@ -9238,7 +8447,7 @@ TEST_F(ExecutionTest, Msad4Test) {
 
   size_t count = Validation_Expected->size();
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "Msad4",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9296,10 +8505,10 @@ TEST_F(ExecutionTest, DenormBinaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -9340,7 +8549,7 @@ TEST_F(ExecutionTest, DenormBinaryFloatOpTest) {
              "must have same number of expected values");
   }
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9407,10 +8616,10 @@ TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }
 
@@ -9453,7 +8662,7 @@ TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) {
              "must have same number of expected values");
   }
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryFPOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -9846,10 +9055,10 @@ void ExecutionTest::WaveIntrinsicsActivePrefixTest(
   static const unsigned int DispatchGroupCount = 1;
   static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount;
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
   if (!DoesDeviceSupportWaveOps(pDevice)) {
@@ -9881,31 +9090,33 @@ void ExecutionTest::WaveIntrinsicsActivePrefixTest(
     for (size_t maskIndex = 0;
          maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction);
          ++maskIndex) {
-      std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-          pDevice, m_support, "WaveIntrinsicsOp",
-          // this callback is called when the test
-          // is creating the resource to run the test
-          [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-            VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
-            size_t size = sizeof(PerThreadData) * ThreadCount;
-            Data.resize(size);
-            PerThreadData *pPrimitives = (PerThreadData *)Data.data();
-            // 4 different inputs for each operation test
-            size_t index = 0;
-            std::vector<T1> *IntList = InputDataList[setIndex];
-            while (index < ThreadCount) {
-              PerThreadData *p = &pPrimitives[index];
-              p->firstLaneId = 0xFFFFBFFF;
-              p->laneIndex = 0xFFFFBFFF;
-              p->mask = MaskFunctionTable[maskIndex]((int)index);
-              p->input = (*IntList)[index % IntList->size()];
-              p->output = 0xFFFFBFFF;
-              index++;
-            }
-            // use shader from data table
-            pShaderOp->Shaders.at(0).Text = Text.m_psz;
-          },
-          ShaderOpSet);
+      std::shared_ptr<st::ShaderOpTestResult> test =
+          st::RunShaderOpTestAfterParse(
+              pDevice, m_support, "WaveIntrinsicsOp",
+              // this callback is called when the test
+              // is creating the resource to run the test
+              [&](LPCSTR Name, std::vector<BYTE> &Data,
+                  st::ShaderOp *pShaderOp) {
+                VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
+                size_t size = sizeof(PerThreadData) * ThreadCount;
+                Data.resize(size);
+                PerThreadData *pPrimitives = (PerThreadData *)Data.data();
+                // 4 different inputs for each operation test
+                size_t index = 0;
+                std::vector<T1> *IntList = InputDataList[setIndex];
+                while (index < ThreadCount) {
+                  PerThreadData *p = &pPrimitives[index];
+                  p->firstLaneId = 0xFFFFBFFF;
+                  p->laneIndex = 0xFFFFBFFF;
+                  p->mask = MaskFunctionTable[maskIndex]((int)index);
+                  p->input = (*IntList)[index % IntList->size()];
+                  p->output = 0xFFFFBFFF;
+                  index++;
+                }
+                // use shader from data table
+                pShaderOp->Shaders.at(0).Text = Text.m_psz;
+              },
+              ShaderOpSet);
 
       // Check the value
       MappedData data;
@@ -10106,11 +9317,11 @@ void ExecutionTest::WaveIntrinsicsMultiPrefixOpTest(
   constexpr size_t ThreadCount = ThreadsPerGroup * DispatchGroupSize;
 
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
 
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_5)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_5)) {
     return;
   }
 
@@ -10134,30 +9345,31 @@ void ExecutionTest::WaveIntrinsicsMultiPrefixOpTest(
 
   for (size_t maskIndex = 0; maskIndex < _countof(MaskFunctionTable);
        ++maskIndex) {
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "WaveIntrinsicsOp",
-        [&](LPCSTR name, std::vector<BYTE> &data, st::ShaderOp *pShaderOp) {
-          UNREFERENCED_PARAMETER(name);
-
-          const size_t dataSize = sizeof(PerThreadData) * ThreadCount;
-
-          data.resize(dataSize);
-          PerThreadData *pThreadData =
-              reinterpret_cast<PerThreadData *>(data.data());
-
-          for (size_t i = 0; i != ThreadCount; ++i) {
-            pThreadData[i].key = keys->at(i % keys->size());
-            pThreadData[i].value = values->at(i % values->size());
-            pThreadData[i].firstLaneId = 0xdeadbeef;
-            pThreadData[i].laneId = 0xdeadbeef;
-            pThreadData[i].mask = MaskFunctionTable[maskIndex]((int)i);
-            pThreadData[i].result = 0xdeadbeef;
-          }
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(
+            pDevice, m_support, "WaveIntrinsicsOp",
+            [&](LPCSTR name, std::vector<BYTE> &data, st::ShaderOp *pShaderOp) {
+              UNREFERENCED_PARAMETER(name);
+
+              const size_t dataSize = sizeof(PerThreadData) * ThreadCount;
+
+              data.resize(dataSize);
+              PerThreadData *pThreadData =
+                  reinterpret_cast<PerThreadData *>(data.data());
+
+              for (size_t i = 0; i != ThreadCount; ++i) {
+                pThreadData[i].key = keys->at(i % keys->size());
+                pThreadData[i].value = values->at(i % values->size());
+                pThreadData[i].firstLaneId = 0xdeadbeef;
+                pThreadData[i].laneId = 0xdeadbeef;
+                pThreadData[i].mask = MaskFunctionTable[maskIndex]((int)i);
+                pThreadData[i].result = 0xdeadbeef;
+              }
 
-          pShaderOp->Shaders.at(0).Text = shaderSource;
-          pShaderOp->Shaders.at(0).Target = shaderProfile;
-        },
-        ShaderOpSet);
+              pShaderOp->Shaders.at(0).Text = shaderSource;
+              pShaderOp->Shaders.at(0).Target = shaderProfile;
+            },
+            ShaderOpSet);
 
     MappedData mappedData;
     test->Test->GetReadBackData("SWaveIntrinsicsOp", &mappedData);
@@ -10234,11 +9446,11 @@ TEST_F(ExecutionTest, CBufferTestHalf) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_2))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_2))
     return;
 
   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
@@ -10250,7 +9462,7 @@ TEST_F(ExecutionTest, CBufferTestHalf) {
 
   uint16_t InputData[] = {0x3F80, 0x3F00, 0x3D80, 0x7BFF};
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "CBufferTestHalf",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         UNREFERENCED_PARAMETER(pShaderOp);
@@ -10280,7 +9492,7 @@ TEST_F(ExecutionTest, CBufferTestHalf) {
 }
 
 void TestBarycentricVariant(bool checkOrdering,
-                            std::shared_ptr<ShaderOpTestResult> test) {
+                            std::shared_ptr<st::ShaderOpTestResult> test) {
   MappedData data;
   D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT width = (UINT)D.Width;
@@ -10364,10 +9576,10 @@ TEST_F(ExecutionTest, BarycentricsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_1))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_1))
     return;
 
   if (!DoesDeviceSupportBarycentrics(pDevice)) {
@@ -10386,9 +9598,9 @@ TEST_F(ExecutionTest, BarycentricsTest) {
   auto ResourceCallbackFnNoShift =
       MakeBarycentricsResourceInitCallbackFn(test_iteration);
 
-  std::shared_ptr<ShaderOpTestResult> test =
-      RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics",
-                                ResourceCallbackFnNoShift, ShaderOpSet);
+  std::shared_ptr<st::ShaderOpTestResult> test =
+      st::RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics",
+                                    ResourceCallbackFnNoShift, ShaderOpSet);
   TestBarycentricVariant(false, test);
 
   // Now test that barycentric ordering is consistent
@@ -10400,8 +9612,9 @@ TEST_F(ExecutionTest, BarycentricsTest) {
     auto ResourceCallbackFn =
         MakeBarycentricsResourceInitCallbackFn(test_iteration);
 
-    std::shared_ptr<ShaderOpTestResult> test2 = RunShaderOpTestAfterParse(
-        pDevice, m_support, "Barycentrics", ResourceCallbackFn, ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test2 =
+        st::RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics",
+                                      ResourceCallbackFn, ShaderOpSet);
     TestBarycentricVariant(true, test2);
   }
 }
@@ -10647,7 +9860,7 @@ bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel,
                                            CComPtr<IStream> &pStream,
                                            const char *&sTy,
                                            const char *&additionalOptions) {
-  if (!CreateDevice(&pDevice, shaderModel)) {
+  if (!createDevice(&pDevice, shaderModel)) {
     return false;
   }
 
@@ -10692,7 +9905,7 @@ bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel,
   }
 
   // read shader config
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   return true;
 }
@@ -10784,7 +9997,7 @@ void ExecutionTest::RunComputeRawBufferLdStTest(
                            (int)sizeof(Ty), additionalOptions) != -1);
 
   // run the shader
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, shaderOpName,
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) ||
@@ -10839,7 +10052,7 @@ void ExecutionTest::RunGraphicsRawBufferLdStTest(
                            (int)sizeof(Ty), additionalOptions) != -1);
 
   // run the shader
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, shaderOpName,
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) ||
@@ -10921,7 +10134,7 @@ TEST_F(ExecutionTest, PackUnpackTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
 
@@ -10929,14 +10142,14 @@ TEST_F(ExecutionTest, PackUnpackTest) {
   string args = "-enable-16bit-types -DPACKUNPACK_PLACEHOLDER";
   string target = "cs_6_2";
 
-  if (!CreateDevice(&pDevice)) {
+  if (!createDevice(&pDevice)) {
     return;
   }
 #else
   string args = "-enable-16bit-types";
   string target = "cs_6_6";
 
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
     return;
   }
 #endif
@@ -10962,7 +10175,7 @@ TEST_F(ExecutionTest, PackUnpackTest) {
   std::vector<SPackUnpackOpOutPacked> expectedPacked(count / 4);
   std::vector<SPackUnpackOpOutUnpacked> expectedUnpacked(count / 4);
 
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTest(
       pDevice, m_support, pStream, "PackUnpackOp",
       // this callback is called when the test
       // is creating the resource to run the test
@@ -11316,7 +10529,7 @@ TEST_F(ExecutionTest, SignatureResourcesTest) {
       "}\n";
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   RunResourceTest(pDevice, pShader.c_str(), L"cs_6_6", /*isDynamic*/ false);
@@ -11355,7 +10568,7 @@ TEST_F(ExecutionTest, DynamicResourcesTest) {
       "}\n";
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   // ResourceDescriptorHeap/SamplerDescriptorHeap requires Resource Binding Tier
@@ -11398,7 +10611,7 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -11436,7 +10649,7 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
                   ((UINT)sm & 0x0f));
 
     CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
+    if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }
     D3D12_FEATURE_DATA_D3D12_OPTIONS devOptions;
@@ -11495,9 +10708,10 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
       // Test Compute shader
       {
         pShaderOp->CS = pShaderOp->GetString("CS66");
-        std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-            pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
-            ShaderOpSet);
+        std::shared_ptr<st::ShaderOpTestResult> test =
+            st::RunShaderOpTestAfterParse(pDevice, m_support,
+                                          "DynamicResourcesDynamicIndexing",
+                                          nullptr, ShaderOpSet);
 
         MappedData resultData;
         test->Test->GetReadBackData("g_result", &resultData);
@@ -11512,9 +10726,10 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
         pShaderOp->CS = nullptr;
         pShaderOp->VS = pShaderOp->GetString("VS66");
         pShaderOp->PS = pShaderOp->GetString("PS66");
-        std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-            pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
-            ShaderOpSet);
+        std::shared_ptr<st::ShaderOpTestResult> test =
+            st::RunShaderOpTestAfterParse(pDevice, m_support,
+                                          "DynamicResourcesDynamicIndexing",
+                                          nullptr, ShaderOpSet);
 
         MappedData resultVSData;
         MappedData resultPSData;
@@ -11577,19 +10792,20 @@ void RunWaveSizeTest(UINT minWaveSize, UINT maxWaveSize,
                              waveSize) != -1);
 
     // run the shader
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "WaveSizeTest",
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-          VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10)));
-          pShaderOp->Shaders.at(0).Arguments = compilerOptions;
-          pShaderOp->Shaders.at(0).Text = waveSizeTestShader;
-
-          VERIFY_IS_TRUE(sizeof(WaveSizeTestData) * MAX_WAVESIZE <=
-                         Data.size());
-          WaveSizeTestData *pInData = (WaveSizeTestData *)Data.data();
-          memset(pInData, 0, sizeof(WaveSizeTestData) * MAX_WAVESIZE);
-        },
-        ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(
+            pDevice, m_support, "WaveSizeTest",
+            [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+              VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10)));
+              pShaderOp->Shaders.at(0).Arguments = compilerOptions;
+              pShaderOp->Shaders.at(0).Text = waveSizeTestShader;
+
+              VERIFY_IS_TRUE(sizeof(WaveSizeTestData) * MAX_WAVESIZE <=
+                             Data.size());
+              WaveSizeTestData *pInData = (WaveSizeTestData *)Data.data();
+              memset(pInData, 0, sizeof(WaveSizeTestData) * MAX_WAVESIZE);
+            },
+            ShaderOpSet);
 
     // verify expected values
     MappedData dataUav;
@@ -11665,7 +10881,7 @@ void ExecuteWaveSizeRangeInstance(UINT minWaveSize, UINT maxWaveSize,
   };
 
   // run the shader
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "WaveSizeTest",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10)));
@@ -11737,7 +10953,7 @@ void ExecutionTest::WaveSizeTest() {
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6,
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6,
                     /*skipUnsupported*/ false)) {
     return;
   }
@@ -11765,7 +10981,7 @@ void ExecutionTest::WaveSizeTest() {
   CComPtr<IStream> pStream;
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
 
   LogCommentFmt(L"Testing WaveSize attribute for shader model 6.6.");
@@ -11777,7 +10993,7 @@ void ExecutionTest::WaveSizeRangeTest() {
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_8,
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_8,
                     /*skipUnsupported*/ false)) {
     return;
   }
@@ -11805,7 +11021,7 @@ void ExecutionTest::WaveSizeRangeTest() {
   CComPtr<IStream> pStream;
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
 
   LogCommentFmt(L"Testing WaveSize Range attribute for shader model 6.8.");
@@ -12034,7 +11250,7 @@ void VerifyAtomicResults(const BYTE *uResults, const BYTE *sResults,
   }
 }
 
-void VerifyAtomicsRawTest(std::shared_ptr<ShaderOpTestResult> test,
+void VerifyAtomicsRawTest(std::shared_ptr<st::ShaderOpTestResult> test,
                           uint64_t maxIdx, size_t bitSize) {
 
   size_t stride = 8;
@@ -12083,7 +11299,7 @@ void VerifyAtomicsRawTest(std::shared_ptr<ShaderOpTestResult> test,
                       bitSize);
 }
 
-void VerifyAtomicsTypedTest(std::shared_ptr<ShaderOpTestResult> test,
+void VerifyAtomicsTypedTest(std::shared_ptr<st::ShaderOpTestResult> test,
                             uint64_t maxIdx, size_t bitSize) {
 
   size_t stride = 8;
@@ -12135,7 +11351,7 @@ void VerifyAtomicsTypedTest(std::shared_ptr<ShaderOpTestResult> test,
   VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize);
 }
 
-void VerifyAtomicsSharedTest(std::shared_ptr<ShaderOpTestResult> test,
+void VerifyAtomicsSharedTest(std::shared_ptr<st::ShaderOpTestResult> test,
                              uint64_t maxIdx, size_t bitSize) {
 
   size_t stride = 8;
@@ -12156,7 +11372,7 @@ void VerifyAtomicsSharedTest(std::shared_ptr<ShaderOpTestResult> test,
                       bitSize);
 }
 
-void VerifyAtomicsTest(std::shared_ptr<ShaderOpTestResult> test,
+void VerifyAtomicsTest(std::shared_ptr<st::ShaderOpTestResult> test,
                        uint64_t maxIdx, size_t bitSize) {
   VerifyAtomicsRawTest(test, maxIdx, bitSize);
   VerifyAtomicsTypedTest(test, maxIdx, bitSize);
@@ -12166,10 +11382,10 @@ TEST_F(ExecutionTest, AtomicsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
@@ -12181,7 +11397,7 @@ TEST_F(ExecutionTest, AtomicsTest) {
   // Test compute shader
   LogCommentFmt(
       L"Verifying 32-bit integer atomic operations in compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
 
   VerifyAtomicsTest(test, 32 * 32, 32);
@@ -12192,8 +11408,8 @@ TEST_F(ExecutionTest, AtomicsTest) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 32-bit integer atomic operations in "
                   L"amp/mesh/pixel shaders");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                     ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 32);
     VerifyAtomicsSharedTest(test, 8 * 8 * 2 + 8 * 8 * 2, 32);
   }
@@ -12202,8 +11418,8 @@ TEST_F(ExecutionTest, AtomicsTest) {
   pShaderOp->MS = nullptr;
   LogCommentFmt(
       L"Verifying 32-bit integer atomic operations in vert/pixel shaders");
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                   ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                       nullptr, ShaderOpSet);
   VerifyAtomicsTest(test, 64 * 64 + 6, 32);
 }
 
@@ -12211,10 +11427,10 @@ TEST_F(ExecutionTest, Atomics64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   if (!DoesDeviceSupportInt64(pDevice)) {
@@ -12240,7 +11456,7 @@ TEST_F(ExecutionTest, Atomics64Test) {
   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in "
                 L"compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
   VerifyAtomicsRawTest(test, 32 * 32, 64);
 
@@ -12249,8 +11465,8 @@ TEST_F(ExecutionTest, Atomics64Test) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers "
                   L"in amp/mesh/pixel shader");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr,
-                                     ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsRawTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64);
   }
 
@@ -12258,8 +11474,8 @@ TEST_F(ExecutionTest, Atomics64Test) {
   pShaderOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in "
                 L"vert/pixel shader");
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr,
-                                   ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot",
+                                       nullptr, ShaderOpSet);
   VerifyAtomicsRawTest(test, 64 * 64 + 6, 64);
 }
 
@@ -12267,10 +11483,10 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   if (!DoesDeviceSupportInt64(pDevice)) {
@@ -12303,7 +11519,7 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw "
                 L"buffers in compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
   VerifyAtomicsRawTest(test, 32 * 32, 64);
 
@@ -12312,8 +11528,8 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw "
                   L"buffers in amp/mesh/pixel shader");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                     ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsRawTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64);
   }
 
@@ -12321,8 +11537,8 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
   pShaderOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw "
                 L"buffers in vert/pixel shader");
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                   ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                       nullptr, ShaderOpSet);
   VerifyAtomicsRawTest(test, 64 * 64 + 6, 64);
 }
 
@@ -12330,10 +11546,10 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   if (!DoesDeviceSupportInt64(pDevice)) {
@@ -12366,7 +11582,7 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) {
   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed "
                 L"resources in compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
   VerifyAtomicsTypedTest(test, 32 * 32, 64);
 
@@ -12375,8 +11591,8 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed "
                   L"resources in amp/mesh/pixel shader");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                     ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsTypedTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64);
   }
 
@@ -12384,8 +11600,8 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) {
   pShaderOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed "
                 L"resources in vert/pixel shader");
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr,
-                                   ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap",
+                                       nullptr, ShaderOpSet);
   VerifyAtomicsTypedTest(test, 64 * 64 + 6, 64);
 }
 
@@ -12393,10 +11609,10 @@ TEST_F(ExecutionTest, AtomicsShared64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
+  if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;
 
   if (!DoesDeviceSupportInt64(pDevice)) {
@@ -12426,7 +11642,7 @@ TEST_F(ExecutionTest, AtomicsShared64Test) {
 
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared "
                 L"variables in compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
   VerifyAtomicsSharedTest(test, 32 * 32, 64);
 
@@ -12435,8 +11651,8 @@ TEST_F(ExecutionTest, AtomicsShared64Test) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared "
                   L"variables in amp/mesh/pixel shader");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr,
-                                     ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsSharedTest(test, 8 * 8 * 2 + 8 * 8 * 2, 64);
   }
 }
@@ -12464,7 +11680,8 @@ void VerifyAtomicFloatResults(const float *results) {
   }
 }
 
-void VerifyAtomicsFloatSharedTest(std::shared_ptr<ShaderOpTestResult> test) {
+void VerifyAtomicsFloatSharedTest(
+    std::shared_ptr<st::ShaderOpTestResult> test) {
   MappedData Data;
   const float *pData = nullptr;
 
@@ -12476,7 +11693,7 @@ void VerifyAtomicsFloatSharedTest(std::shared_ptr<ShaderOpTestResult> test) {
   VerifyAtomicFloatResults(pData);
 }
 
-void VerifyAtomicsFloatTest(std::shared_ptr<ShaderOpTestResult> test) {
+void VerifyAtomicsFloatTest(std::shared_ptr<st::ShaderOpTestResult> test) {
 
   // struct mirroring that in the shader
   struct AtomicStuff {
@@ -12524,10 +11741,10 @@ TEST_F(ExecutionTest, AtomicsFloatTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
+  if (!createDevice(&pDevice))
     return;
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
@@ -12539,7 +11756,7 @@ TEST_F(ExecutionTest, AtomicsFloatTest) {
   // Test compute shader
   LogCommentFmt(
       L"Verifying float cmp/xchg atomic operations in compute shader");
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+  std::shared_ptr<st::ShaderOpTestResult> test = st::RunShaderOpTestAfterParse(
       pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
   VerifyAtomicsFloatTest(test);
   VerifyAtomicsFloatSharedTest(test);
@@ -12549,8 +11766,8 @@ TEST_F(ExecutionTest, AtomicsFloatTest) {
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying float cmp/xchg atomic operations in "
                   L"amp/mesh/pixel shaders");
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics",
-                                     nullptr, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics",
+                                         nullptr, ShaderOpSet);
     VerifyAtomicsFloatTest(test);
     VerifyAtomicsFloatSharedTest(test);
   }
@@ -12559,8 +11776,8 @@ TEST_F(ExecutionTest, AtomicsFloatTest) {
   pShaderOp->MS = nullptr;
   LogCommentFmt(
       L"Verifying float cmp/xchg atomic operations in vert/pixel shaders");
-  test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr,
-                                   ShaderOpSet);
+  test = st::RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics",
+                                       nullptr, ShaderOpSet);
   VerifyAtomicsFloatTest(test);
 }
 
@@ -12589,7 +11806,7 @@ TEST_F(ExecutionTest, HelperLaneTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -12604,19 +11821,20 @@ TEST_F(ExecutionTest, HelperLaneTest) {
                   ((UINT)sm & 0x0f));
 
     CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */))
+    if (!createDevice(&pDevice, sm, false /* skipUnsupported */))
       continue;
 
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "HelperLaneTestNoWave",
-        // this callback is called when the test is creating the resource to
-        // run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-          VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0"));
-          std::fill(Data.begin(), Data.end(), (BYTE)0xCC);
-          UNREFERENCED_PARAMETER(pShaderOp);
-        },
-        ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(
+            pDevice, m_support, "HelperLaneTestNoWave",
+            // this callback is called when the test is creating the resource to
+            // run the test
+            [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+              VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0"));
+              std::fill(Data.begin(), Data.end(), (BYTE)0xCC);
+              UNREFERENCED_PARAMETER(pShaderOp);
+            },
+            ShaderOpSet);
 
     struct HelperLaneTestResult {
       int32_t is_helper_00;
@@ -12989,7 +12207,7 @@ TEST_F(ExecutionTest, HelperLaneTestWave) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -13010,7 +12228,7 @@ TEST_F(ExecutionTest, HelperLaneTestWave) {
     bool smPassed = true;
 
     CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
+    if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }
 
@@ -13045,9 +12263,10 @@ TEST_F(ExecutionTest, HelperLaneTestWave) {
 
     // Test Compute shader
     {
-      std::shared_ptr<ShaderOpTestResult> test =
-          RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave",
-                                    CleanUAVBuffer0Buffer, ShaderOpSet);
+      std::shared_ptr<st::ShaderOpTestResult> test =
+          st::RunShaderOpTestAfterParse(pDevice, m_support,
+                                        "HelperLaneTestWave",
+                                        CleanUAVBuffer0Buffer, ShaderOpSet);
 
       MappedData uavData;
       test->Test->GetReadBackData("UAVBuffer0", &uavData);
@@ -13069,9 +12288,10 @@ TEST_F(ExecutionTest, HelperLaneTestWave) {
     // Test Vertex + Pixel shader
     {
       pShaderOp->CS = nullptr;
-      std::shared_ptr<ShaderOpTestResult> test =
-          RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave",
-                                    CleanUAVBuffer0Buffer, ShaderOpSet);
+      std::shared_ptr<st::ShaderOpTestResult> test =
+          st::RunShaderOpTestAfterParse(pDevice, m_support,
+                                        "HelperLaneTestWave",
+                                        CleanUAVBuffer0Buffer, ShaderOpSet);
 
       MappedData uavData;
       test->Test->GetReadBackData("UAVBuffer0", &uavData);
@@ -13130,7 +12350,7 @@ TEST_F(ExecutionTest, QuadAnyAll) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -13163,7 +12383,7 @@ TEST_F(ExecutionTest, QuadAnyAll) {
     }
 
     CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
+    if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }
 
@@ -13176,8 +12396,9 @@ TEST_F(ExecutionTest, QuadAnyAll) {
     Skipped = false;
 
     // test compute
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, "QuadAnyAll", CleanUAVBuffer0Buffer, ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll",
+                                      CleanUAVBuffer0Buffer, ShaderOpSet);
 
     MappedData uavData;
     test->Test->GetReadBackData("UAVBuffer0", &uavData);
@@ -13189,8 +12410,8 @@ TEST_F(ExecutionTest, QuadAnyAll) {
 
     pShaderOp->CS = nullptr;
     // test AS/MS
-    test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll",
-                                     CleanUAVBuffer0Buffer, ShaderOpSet);
+    test = st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll",
+                                         CleanUAVBuffer0Buffer, ShaderOpSet);
 
     test->Test->GetReadBackData("UAVBuffer0", &uavData);
     Result = VerifyQuadAnyAllResults((int2 *)uavData.data());
@@ -13337,7 +12558,7 @@ TEST_F(ExecutionTest, IsNormalTest) {
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
   CComPtr<ID3D12Device> pDevice;
-  VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0,
+  VERIFY_IS_TRUE(createDevice(&pDevice, D3D_SHADER_MODEL_6_0,
                               false /* skipUnsupported */));
 
   // The input is -Zero, Zero, -Denormal, Denormal, -Infinity, Infinity, -NaN,
@@ -13354,7 +12575,7 @@ TEST_F(ExecutionTest, IsNormalTest) {
   std::vector<unsigned int> *Validation_Expected = &Validation_Expected_Vec;
 
   CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support);
 
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
@@ -13395,9 +12616,10 @@ TEST_F(ExecutionTest, IsNormalTest) {
   // Test Compute shader
   {
     pShaderOp->CS = pShaderOp->GetString("CS60");
-    std::shared_ptr<ShaderOpTestResult> test =
-        RunShaderOpTestAfterParse(pDevice, m_support, "IsNormal",
-                                  ResourceInitFn, ShaderInitFn, ShaderOpSet);
+    std::shared_ptr<st::ShaderOpTestResult> test =
+        st::RunShaderOpTestAfterParse(pDevice, m_support, "IsNormal",
+                                      ResourceInitFn, ShaderInitFn,
+                                      ShaderOpSet);
 
     MappedData data;
     test->Test->GetReadBackData("g_TestData", &data);
diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h
new file mode 100644
index 0000000000..3822ef02ad
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h
@@ -0,0 +1,405 @@
+#ifndef HLSLEXECTESTUTILS_H
+#define HLSLEXECTESTUTILS_H
+
+#include "dxc/Support/dxcapi.use.h"
+#include "dxc/Test/HlslTestUtils.h"
+#include <Verify.h>
+#include <d3d12.h>
+#include <dxgi1_4.h>
+
+namespace ExecTestUtils {
+// This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we
+// only require the Windows 10 SDK.
+typedef enum D3D_SHADER_MODEL {
+  D3D_SHADER_MODEL_5_1 = 0x51,
+  D3D_SHADER_MODEL_6_0 = 0x60,
+  D3D_SHADER_MODEL_6_1 = 0x61,
+  D3D_SHADER_MODEL_6_2 = 0x62,
+  D3D_SHADER_MODEL_6_3 = 0x63,
+  D3D_SHADER_MODEL_6_4 = 0x64,
+  D3D_SHADER_MODEL_6_5 = 0x65,
+  D3D_SHADER_MODEL_6_6 = 0x66,
+  D3D_SHADER_MODEL_6_7 = 0x67,
+  D3D_SHADER_MODEL_6_8 = 0x68,
+  D3D_SHADER_MODEL_6_9 = 0x69,
+  D3D_HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_9
+} D3D_SHADER_MODEL;
+} // namespace ExecTestUtils
+
+static bool useDebugIfaces() { return true; }
+
+static bool useDxbc() {
+#ifdef _HLK_CONF
+  return false;
+#else
+  return hlsl_test::GetTestParamBool(L"DXBC");
+#endif
+}
+
+static bool useWarpByDefualt() {
+#ifdef _HLK_CONF
+  return false;
+#else
+  return true;
+#endif
+}
+
+// A more recent Windows SDK than currently required is needed for these.
+typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
+    UINT NumFeatures, __in_ecount(NumFeatures) const IID *IIDs,
+    __in_ecount_opt(NumFeatures) void *ConfigurationStructs,
+    __in_ecount_opt(NumFeatures) UINT *ConfigurationStructSizes);
+
+static const GUID D3D12ExperimentalShaderModelsID =
+    {/* 76f5573e-f13a-40f5-b297-81ce9e18933f */
+     0x76f5573e,
+     0xf13a,
+     0x40f5,
+     {0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f}};
+
+// Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically.
+typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID Rclsid, REFIID Riid,
+                                             void **Debug);
+
+#ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__
+
+// Copied from AgilitySDK D3D12.h to programmatically enable when in developer
+// mode.
+#define __ID3D12SDKConfiguration_INTERFACE_DEFINED__
+
+EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = {
+    0xe9eb5314,
+    0x33aa,
+    0x42b2,
+    {0xa7, 0x18, 0xd7, 0x7f, 0x58, 0xb1, 0xf1, 0xc7}};
+EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = {
+    0x7cda6aca,
+    0xa03e,
+    0x49c8,
+    {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}};
+
+MIDL_INTERFACE("e9eb5314-33aa-42b2-a718-d77f58b1f1c7")
+ID3D12SDKConfiguration : public IUnknown {
+public:
+  virtual HRESULT STDMETHODCALLTYPE SetSDKVersion(UINT SDKVersion,
+                                                  LPCSTR SDKPath) = 0;
+};
+#endif /* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */
+
+// Returns the path GetModuleFileNameW reports for the current process, or an
+// empty string if the call fails or the path fills the whole buffer.
+static std::wstring getModuleName() {
+  wchar_t Buffer[MAX_PATH + 1] = {0};
+  const DWORD Written = GetModuleFileNameW(NULL, Buffer, MAX_PATH);
+  const bool Failed = (Written == 0) || (Written == MAX_PATH);
+
+  return Failed ? std::wstring() : std::wstring(Buffer, Written);
+}
+
+// Resolves an SDKPath beginning with ".\" against the directory of the
+// running module; returns SDKPath unchanged if it has no such prefix or the
+// module path contains no backslash.
+static std::wstring computeSDKFullPath(std::wstring SDKPath) {
+  const std::wstring ModulePath = getModuleName();
+  const size_t DirEnd = ModulePath.rfind('\\');
+  const bool IsDotRelative = SDKPath.compare(0, 2, L".\\") == 0;
+  if (DirEnd == std::wstring::npos || !IsDotRelative)
+    return SDKPath;
+  // Skip the leading '.' so the remaining "\..." appends to the directory.
+  return ModulePath.substr(0, DirEnd) + SDKPath.substr(1);
+}
+
+static UINT getD3D12SDKVersion(std::wstring SDKPath) {
+  // Try to automatically get the D3D12SDKVersion from the DLL
+  UINT SDKVersion = 0;
+  std::wstring D3DCorePath = computeSDKFullPath(SDKPath);
+  D3DCorePath.append(L"D3D12Core.dll");
+  HMODULE D3DCore = LoadLibraryW(D3DCorePath.c_str());
+  if (D3DCore) {
+    if (UINT *SDKVersionOut =
+            (UINT *)GetProcAddress(D3DCore, "D3D12SDKVersion"))
+      SDKVersion = *SDKVersionOut;
+    FreeModule(D3DCore);
+  }
+  return SDKVersion;
+}
+
+// Creates the test device (WARP or hardware, per test parameters); unless
+// DXBC is requested it must support TestModel. Returns false with *D3DDevice
+// null on failure, marking the test Skipped only if SkipUnsupported is set.
+static bool createDevice(ID3D12Device **D3DDevice,
+                         ExecTestUtils::D3D_SHADER_MODEL TestModel =
+                             ExecTestUtils::D3D_SHADER_MODEL_6_0,
+                         bool SkipUnsupported = true) {
+  *D3DDevice = nullptr;
+
+  if (TestModel > ExecTestUtils::D3D_HIGHEST_SHADER_MODEL) {
+    hlsl_test::LogCommentFmt(L"Installed SDK does not support "
+                             L"shader model 6.%1u",
+                             (UINT)TestModel & 0x0f);
+
+    if (SkipUnsupported)
+      WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+    return false;
+  }
+  CComPtr<IDXGIFactory4> DXGIFactory;
+  CComPtr<ID3D12Device> D3DDeviceCom;
+  VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&DXGIFactory)));
+  if (hlsl_test::GetTestParamUseWARP(useWarpByDefualt())) {
+    CComPtr<IDXGIAdapter> WarpAdapter;
+    VERIFY_SUCCEEDED(DXGIFactory->EnumWarpAdapter(IID_PPV_ARGS(&WarpAdapter)));
+    HRESULT CreateHR = D3D12CreateDevice(WarpAdapter, D3D_FEATURE_LEVEL_11_0,
+                                         IID_PPV_ARGS(&D3DDeviceCom));
+    if (FAILED(CreateHR)) {
+      hlsl_test::LogCommentFmt(
+          L"The available version of WARP does not support d3d12.");
+
+      if (SkipUnsupported)
+        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+      return false;
+    }
+    if (HMODULE WarpModule = GetModuleHandleW(L"d3d10warp.dll")) {
+      WCHAR FullModuleFilePath[MAX_PATH] = L"";
+      // The size argument is a count of WCHARs, not a byte count.
+      GetModuleFileNameW(WarpModule, FullModuleFilePath,
+                         _countof(FullModuleFilePath));
+      WEX::Logging::Log::Comment(WEX::Common::String().Format(
+          L"WARP driver loaded from: %ls", FullModuleFilePath));
+    }
+  } else {
+    CComPtr<IDXGIAdapter1> HardwareAdapter;
+    WEX::Common::String AdapterValue;
+    HRESULT HR = WEX::TestExecution::RuntimeParameters::TryGetValue(
+        L"Adapter", AdapterValue);
+    if (SUCCEEDED(HR))
+      st::GetHardwareAdapter(DXGIFactory, AdapterValue, &HardwareAdapter);
+    else
+      WEX::Logging::Log::Comment(
+          L"Using default hardware adapter with D3D12 support.");
+
+    VERIFY_SUCCEEDED(D3D12CreateDevice(HardwareAdapter, D3D_FEATURE_LEVEL_11_0,
+                                       IID_PPV_ARGS(&D3DDeviceCom)));
+  }
+  // Bail out before the adapter query below dereferences a missing device.
+  if (D3DDeviceCom == nullptr)
+    return false;
+
+  // retrieve adapter information
+  const LUID AdapterID = D3DDeviceCom->GetAdapterLuid();
+  CComPtr<IDXGIAdapter> DXGIAdapter;
+  VERIFY_SUCCEEDED(
+      DXGIFactory->EnumAdapterByLuid(AdapterID, IID_PPV_ARGS(&DXGIAdapter)));
+  DXGI_ADAPTER_DESC AdapterDesc;
+  VERIFY_SUCCEEDED(DXGIAdapter->GetDesc(&AdapterDesc));
+  hlsl_test::LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description);
+
+  if (!useDxbc()) {
+    // Check for DXIL support.
+    typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
+      ExecTestUtils::D3D_SHADER_MODEL HighestShaderModel;
+    } D3D12_FEATURE_DATA_SHADER_MODEL;
+    const UINT D3D12_FEATURE_SHADER_MODEL = 7;
+    D3D12_FEATURE_DATA_SHADER_MODEL SMData = {TestModel};
+    if (FAILED(D3DDeviceCom->CheckFeatureSupport(
+            (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData,
+            sizeof(SMData))) ||
+        SMData.HighestShaderModel < TestModel) {
+      hlsl_test::LogCommentFmt(L"The selected device does not support "
+                               L"shader model 6.%1u",
+                               (UINT)TestModel & 0x0f);
+
+      if (SkipUnsupported)
+        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+      return false;
+    }
+  }
+
+  if (useDebugIfaces()) {
+    CComPtr<ID3D12InfoQueue> InfoQueue;
+    if (SUCCEEDED(D3DDeviceCom->QueryInterface(&InfoQueue)))
+      InfoQueue->SetMuteDebugOutput(FALSE);
+  }
+
+  *D3DDevice = D3DDeviceCom.Detach();
+  return true;
+}
+
+inline void readHlslDataIntoNewStream(LPCWSTR RelativePath, IStream **Stream,
+                                      dxc::DxcDllSupport &Support) {
+  VERIFY_SUCCEEDED(Support.Initialize());
+  CComPtr<IDxcLibrary> Library;
+  CComPtr<IDxcBlobEncoding> Blob;
+  CComPtr<IStream> StreamCom;
+  std::wstring Path = hlsl_test::GetPathToHlslDataFile(
+      RelativePath, HLSLDATAFILEPARAM, DEFAULT_EXEC_TEST_DIR);
+  VERIFY_SUCCEEDED(Support.CreateInstance(CLSID_DxcLibrary, &Library));
+  VERIFY_SUCCEEDED(Library->CreateBlobFromFile(Path.c_str(), nullptr, &Blob));
+  VERIFY_SUCCEEDED(Library->CreateStreamFromBlobReadOnly(Blob, &StreamCom));
+  *Stream = StreamCom.Detach();
+}
+
+// Sets the Agility SDK version/path on Runtime and verifies D3D12Core loads.
+static HRESULT enableAgilitySDK(HMODULE Runtime, UINT SDKVersion,
+                                LPCWSTR SDKPath) {
+  D3D12GetInterfaceFn GetInterfaceFunc =
+      (D3D12GetInterfaceFn)GetProcAddress(Runtime, "D3D12GetInterface");
+  // A d3d12.dll without this export must fail here, not crash on the call.
+  if (GetInterfaceFunc == nullptr)
+    return HRESULT_FROM_WIN32(GetLastError());
+  CComPtr<ID3D12SDKConfiguration> D3D12SDKConfiguration;
+  IFR(GetInterfaceFunc(CLSID_D3D12SDKConfiguration,
+                       IID_PPV_ARGS(&D3D12SDKConfiguration)));
+  IFR(D3D12SDKConfiguration->SetSDKVersion(SDKVersion, CW2A(SDKPath)));
+
+  // SetSDKVersion appears to succeed even when D3D12Core is missing or its
+  // version doesn't match; the failure only shows up on the next call that
+  // needs D3D12Core.dll loaded. Clearing experimental features is valid, a
+  // no-op at this point, and requires that load, so it exposes the failure.
+  D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc =
+      (D3D12EnableExperimentalFeaturesFn)GetProcAddress(
+          Runtime, "D3D12EnableExperimentalFeatures");
+  // Not expected: a runtime this old should already have failed to create
+  // D3D12SDKConfiguration above. Fail if it happens anyway.
+  if (ExperimentalFeaturesFunc == nullptr)
+    return HRESULT_FROM_WIN32(GetLastError());
+
+  return ExperimentalFeaturesFunc(0, nullptr, nullptr, nullptr);
+}
+
+// Enables experimental shader models plus any AdditionalFeatures on hRuntime.
+static HRESULT
+enableExperimentalShaderModels(HMODULE hRuntime,
+                               UUID AdditionalFeatures[] = nullptr,
+                               size_t NumAdditionalFeatures = 0) {
+  D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc =
+      (D3D12EnableExperimentalFeaturesFn)GetProcAddress(
+          hRuntime, "D3D12EnableExperimentalFeatures");
+  if (ExperimentalFeaturesFunc == nullptr)
+    return HRESULT_FROM_WIN32(GetLastError());
+
+  // Use this header's GUID so no newer Windows SDK declaration is required.
+  std::vector<UUID> Features;
+  Features.push_back(D3D12ExperimentalShaderModelsID);
+  if (AdditionalFeatures != nullptr && NumAdditionalFeatures > 0)
+    Features.insert(Features.end(), AdditionalFeatures,
+                    AdditionalFeatures + NumAdditionalFeatures);
+
+  return ExperimentalFeaturesFunc((UINT)Features.size(), Features.data(),
+                                  nullptr, nullptr);
+}
+
+static HRESULT
+enableExperimentalShaderModels(UUID AdditionalFeatures[] = nullptr,
+                               size_t NumAdditionalFeatures = 0) {
+  HMODULE Runtime = LoadLibraryW(L"d3d12.dll");
+  if (Runtime == NULL)
+    return E_FAIL;
+  return enableExperimentalShaderModels(Runtime, AdditionalFeatures,
+                                        NumAdditionalFeatures);
+}
+
+static HRESULT disableExperimentalShaderModels() {
+  HMODULE Runtime = LoadLibraryW(L"d3d12.dll");
+  if (Runtime == NULL)
+    return E_FAIL;
+
+  D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc =
+      (D3D12EnableExperimentalFeaturesFn)GetProcAddress(
+          Runtime, "D3D12EnableExperimentalFeatures");
+  if (ExperimentalFeaturesFunc == nullptr)
+    return HRESULT_FROM_WIN32(GetLastError());
+
+  return ExperimentalFeaturesFunc(0, nullptr, nullptr, nullptr);
+}
+
+static HRESULT enableAgilitySDK(HMODULE Runtime) {
+  // D3D12SDKVersion > 1 will use provided version, otherwise, auto-detect.
+  // D3D12SDKVersion == 1 means fail if we can't auto-detect.
+  UINT SDKVersion = 0;
+  WEX::TestExecution::RuntimeParameters::TryGetValue(L"D3D12SDKVersion",
+                                                     SDKVersion);
+
+  // SDKPath must be relative path from .exe, which means relative to
+  // TE.exe location, and must start with ".\\", such as with the
+  // default: ".\\D3D12\\"
+  WEX::Common::String SDKPath;
+  if (SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(
+          L"D3D12SDKPath", SDKPath))) {
+    // Make sure path ends in backslash
+    if (!SDKPath.IsEmpty() && SDKPath.Right(1) != "\\")
+      SDKPath.Append("\\");
+  }
+
+  if (SDKPath.IsEmpty())
+    SDKPath = L".\\D3D12\\";
+
+  const bool MustFind = SDKVersion > 0;
+  if (SDKVersion <= 1) {
+    // lookup version from D3D12Core.dll
+    SDKVersion = getD3D12SDKVersion((LPCWSTR)SDKPath);
+    if (MustFind && SDKVersion == 0) {
+      hlsl_test::LogErrorFmt(L"Agility SDK not found in relative path: %s",
+                             (LPCWSTR)SDKPath);
+      return E_FAIL;
+    }
+  }
+
+  // Not found, not asked for.
+  if (SDKVersion == 0)
+    return S_FALSE;
+
+  HRESULT HR = enableAgilitySDK(Runtime, SDKVersion, (LPCWSTR)SDKPath);
+  if (FAILED(HR)) {
+    // If SDKVersion provided, fail if not successful.
+    // 1 means we should find it, and fill in the version automatically.
+    if (MustFind) {
+      hlsl_test::LogErrorFmt(
+          L"Failed to set Agility SDK version %d at path: %s", SDKVersion,
+          (LPCWSTR)SDKPath);
+      return HR;
+    }
+    return S_FALSE;
+  }
+  if (HR == S_OK)
+    hlsl_test::LogCommentFmt(L"Agility SDK version set to: %d", SDKVersion);
+
+  return HR;
+}
+
+static HRESULT enableExperimentalMode(HMODULE Runtime) {
+#ifdef _FORCE_EXPERIMENTAL_SHADERS
+  bool ExperimentalShaderModels = true;
+#else
+  bool ExperimentalShaderModels =
+      hlsl_test::GetTestParamBool(L"ExperimentalShaders");
+#endif // _FORCE_EXPERIMENTAL_SHADERS
+
+  HRESULT HR = S_FALSE;
+  if (ExperimentalShaderModels) {
+    HR = enableExperimentalShaderModels(Runtime);
+    if (SUCCEEDED(HR))
+      WEX::Logging::Log::Comment(L"Experimental shader models enabled.");
+  }
+
+  return HR;
+}
+
+static HRESULT enableDebugLayer() {
+  // Enables the D3D12 debug layer. It does not yet validate DXIL programs
+  // that require rewriting, but basic logging should work properly.
+  HRESULT HR = S_FALSE; // Stays S_FALSE when useDebugIfaces() is false.
+  if (useDebugIfaces()) {
+    CComPtr<ID3D12Debug> DebugController;
+    HR = D3D12GetDebugInterface(IID_PPV_ARGS(&DebugController));
+    if (SUCCEEDED(HR)) {
+      DebugController->EnableDebugLayer();
+      HR = S_OK;
+    }
+  }
+  return HR; // S_OK once enabled, otherwise S_FALSE or the failure HRESULT.
+}
+
+#endif // HLSLEXECTESTUTILS_H
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
index 8dde3faa0b..9e18351a6d 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
@@ -866,6 +866,11 @@ void ShaderOpTest::CreateShaders() {
       CHECK_HR(pLibrary->CreateBlobWithEncodingFromPinned(
           pText, (UINT32)strlen(pText), CP_UTF8, &pTextBlob));
       CHECK_HR(m_pDxcSupport->CreateInstance(CLSID_DxcCompiler, &pCompiler));
+      WEX::Logging::Log::Comment(L"Compiling shader:");
+      ShaderOpLogFmt(L"\tTarget profile: %S", S.Target);
+      if (argumentsWList.size() > 0) {
+        ShaderOpLogFmt(L"\tArguments: %S", pArguments);
+      }
       CHECK_HR(pCompiler->Compile(pTextBlob, nameW, entryPointW, targetW,
                                   (LPCWSTR *)argumentsWList.data(),
                                   (UINT32)argumentsWList.size(), nullptr, 0,
@@ -2752,6 +2757,74 @@ bool ShaderOpParser::ReadAtElementName(IXmlReader *pReader, LPCWSTR pName) {
   }
 }
 
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                          LPCSTR pName,
+                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
+                          st::ShaderOpTest::TShaderCallbackFn pShaderCallback,
+                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
+  st::ShaderOp *pShaderOp;
+  if (pName == nullptr) {
+    if (ShaderOpSet->ShaderOps.size() != 1) {
+      VERIFY_FAIL(L"Expected a single shader operation.");
+    }
+    pShaderOp = ShaderOpSet->ShaderOps[0].get();
+  } else {
+    pShaderOp = ShaderOpSet->GetShaderOp(pName);
+  }
+  if (pShaderOp == nullptr) {
+    std::string msg = "Unable to find shader op ";
+    msg += pName;
+    msg += "; available ops";
+    const char sep = ':';
+    for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
+      msg += sep;
+      msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
+    }
+    CA2W msgWide(msg.c_str());
+    VERIFY_FAIL(msgWide.m_psz);
+  }
+
+  // This won't actually be used since we're supplying the device,
+  // but let's make it consistent.
+  pShaderOp->UseWarpDevice = hlsl_test::GetTestParamUseWARP(true);
+
+  std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
+  test->SetDxcSupport(&support);
+  test->SetInitCallback(pInitCallback);
+  test->SetShaderCallback(pShaderCallback);
+  test->SetDevice(pDevice);
+  test->RunShaderOp(pShaderOp);
+
+  std::shared_ptr<ShaderOpTestResult> result =
+      std::make_shared<ShaderOpTestResult>();
+  result->ShaderOpSet = ShaderOpSet;
+  result->Test = test;
+  result->ShaderOp = pShaderOp;
+  return result;
+}
+
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                          LPCSTR pName,
+                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
+                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
+  return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback,
+                                   nullptr, ShaderOpSet);
+}
+
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                IStream *pStream, LPCSTR pName,
+                st::ShaderOpTest::TInitCallbackFn pInitCallback) {
+  DXASSERT_NOMSG(pStream != nullptr);
+  std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
+      std::make_shared<st::ShaderOpSet>();
+  st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
+  return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback,
+                                   ShaderOpSet);
+}
+
 #pragma endregion Parsing support
 
 } // namespace st
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.h b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
index b71ee08765..52b5f37730 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.h
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
@@ -12,12 +12,12 @@
 // results.                                                                  //
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
-
-#pragma once
-
 #ifndef __SHADEROPTEST_H__
 #define __SHADEROPTEST_H__
 
+#include <atlbase.h>
+#include <d3d12.h>
+#include <dxgi1_4.h>
 #include <functional>
 #include <map>
 #include <memory>
@@ -344,6 +344,32 @@ void ParseShaderOpSetFromStream(IStream *pStream, ShaderOpSet *pShaderOpSet);
 // Deserialize a ShaderOpSet from an IXmlReader instance.
 void ParseShaderOpSetFromXml(IXmlReader *pReader, ShaderOpSet *pShaderOpSet);
 
+///////////////////////////////////////////////////////////////////////////////
+// RunShaderOpTest* helper functions and the result bundle they return.
+struct ShaderOpTestResult {
+  st::ShaderOp *ShaderOp; // raw pointer; this struct never frees it
+  std::shared_ptr<st::ShaderOpSet> ShaderOpSet; // keeps the ShaderOpSet alive
+  std::shared_ptr<st::ShaderOpTest> Test; // the test object that ran the op
+};
+
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                          LPCSTR pName,
+                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
+                          st::ShaderOpTest::TShaderCallbackFn pShaderCallback,
+                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet);
+
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                          LPCSTR pName,
+                          st::ShaderOpTest::TInitCallbackFn pInitCallback,
+                          std::shared_ptr<st::ShaderOpSet> ShaderOpSet);
+
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                IStream *pStream, LPCSTR pName,
+                st::ShaderOpTest::TInitCallbackFn pInitCallback);
+
 } // namespace st
 
 #endif // __SHADEROPTEST_H__
diff --git a/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp b/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp
new file mode 100644
index 0000000000..16badb074d
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp
@@ -0,0 +1,376 @@
+#include "TableParameterHandler.h"
+#include "dxc/Test/HlslTestUtils.h"
+
+TableParameterHandler::TableParameterHandler(TableParameter *pTable,
+                                             size_t size)
+    : m_table(pTable), m_tableSize(size) {
+  clearTableParameter(); // reset scalar fields before reading the row
+  VERIFY_SUCCEEDED(ParseTableRow()); // fill the table from TAEF TestData
+}
+
+TableParameter *TableParameterHandler::GetTableParamByName(LPCWSTR name) {
+  // Linear, case-insensitive search; asserts and returns nullptr on a miss.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i];
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+void TableParameterHandler::clearTableParameter() { // zero all scalar slots
+  for (TableParameter *p = m_table, *e = m_table + m_tableSize; p != e; ++p) {
+    p->m_int32 = p->m_int16 = p->m_int8 = 0; // also covers m_int8/m_int16
+    p->m_uint = p->m_half = 0; // also covers m_half
+    p->m_double = p->m_float = 0; // also covers m_float
+    p->m_bool = false;
+    p->m_str = WEX::Common::String();
+  }
+}
+
+template <class T1>
+std::vector<T1> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  return nullptr; // generic fallback: no table is looked up for this T1
+}
+
+template <>
+std::vector<int> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's int32 table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_int32Table;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<int8_t> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's int8 table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_int8Table;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<int16_t> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's int16 table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_int16Table;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<unsigned int> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's uint32 table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_uint32Table;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<float> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's float table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_floatTable;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<uint16_t> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // NOTE(review): uint16_t selects m_halfTable here, not m_uint16Table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_halfTable;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<double> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's double table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_doubleTable;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+template <>
+std::vector<bool> *TableParameterHandler::GetDataArray(LPCWSTR name) {
+  // Case-insensitive name match; yields that entry's bool table.
+  for (size_t i = 0; i < m_tableSize; ++i)
+    if (_wcsicmp(name, m_table[i].m_name) == 0)
+      return &m_table[i].m_boolTable;
+  // %ls, not %s: name is a wide string while the format string is narrow.
+  DXASSERT_ARGS(false, "Invalid Table Parameter Name %ls", name);
+  return nullptr;
+}
+
+// Reads TAEF TestData into m_table: every entry is fetched by name and stored
+// in the member matching its m_type. Returns E_FAIL (after logging) when a
+// required entry cannot be read or a conversion left errno at ERANGE, and
+// S_OK otherwise.
+HRESULT TableParameterHandler::ParseTableRow() {
+  TableParameter *table = m_table;
+  for (unsigned int i = 0; i < m_tableSize; ++i) {
+    // Start clean so the ERANGE check below only reflects this entry.
+    errno = 0;
+    switch (table[i].m_type) {
+    case TableParameter::INT8:
+      // TryGetValue does not support int8, so read an int32 and narrow it.
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_int32)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_int8 = (int8_t)(table[i].m_int32);
+      break;
+    case TableParameter::INT16:
+      // TryGetValue does not support int16, so read an int32 and narrow it.
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_int32)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_int16 = (short)(table[i].m_int32);
+      break;
+    case TableParameter::INT32:
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_int32)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      break;
+    case TableParameter::UINT:
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_uint)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      break;
+    case TableParameter::DOUBLE:
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
+              table[i].m_name, table[i].m_double)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      break;
+    case TableParameter::STRING:
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_str)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      break;
+    case TableParameter::BOOL:
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           table[i].m_bool)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      break;
+    case TableParameter::INT8_TABLE: {
+      WEX::TestExecution::TestDataArray<int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      // TryGetValue does not support reading from int8
+      table[i].m_int8Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_int8Table[j] = (int8_t)tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::INT16_TABLE: {
+      WEX::TestExecution::TestDataArray<int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      // TryGetValue does not support reading from int16
+      table[i].m_int16Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_int16Table[j] = (int16_t)tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::INT32_TABLE: {
+      WEX::TestExecution::TestDataArray<int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_int32Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_int32Table[j] = tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::UINT8_TABLE: {
+      WEX::TestExecution::TestDataArray<int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      // TryGetValue does not support reading from uint8. NOTE(review): values
+      // land in m_int8Table, not m_uint8Table; confirm callers before changing.
+      table[i].m_int8Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_int8Table[j] = (uint8_t)tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::UINT16_TABLE: {
+      WEX::TestExecution::TestDataArray<int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      // TryGetValue does not support reading from uint16
+      table[i].m_uint16Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_uint16Table[j] = (uint16_t)tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::UINT32_TABLE: {
+      WEX::TestExecution::TestDataArray<unsigned int> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_uint32Table.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_uint32Table[j] = tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::FLOAT_TABLE: {
+      // Read as strings so ParseDataToFloat can interpret each element.
+      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_floatTable.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        ParseDataToFloat(tempTable[j], table[i].m_floatTable[j]);
+      }
+      break;
+    }
+    case TableParameter::HALF_TABLE: {
+      // Read as strings: each element is either raw 0x/0b bits or a float.
+      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_halfTable.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        uint16_t value = 0;
+        if (IsHexString(tempTable[j], &value)) {
+          table[i].m_halfTable[j] = value;
+        } else {
+          float val = 0.0f;
+          ParseDataToFloat(tempTable[j], val);
+          if (isdenorm(val))
+            table[i].m_halfTable[j] =
+                signbit(val) ? Float16NegDenorm : Float16PosDenorm;
+          else
+            table[i].m_halfTable[j] = ConvertFloat32ToFloat16(val);
+        }
+      }
+      break;
+    }
+    case TableParameter::DOUBLE_TABLE: {
+      WEX::TestExecution::TestDataArray<double> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_doubleTable.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_doubleTable[j] = tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::BOOL_TABLE: {
+      WEX::TestExecution::TestDataArray<bool> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_boolTable.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_boolTable[j] = tempTable[j];
+      }
+      break;
+    }
+    case TableParameter::STRING_TABLE: {
+      WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
+      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
+                                                           tempTable)) &&
+          table[i].m_required) {
+        hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name);
+        return E_FAIL;
+      }
+      table[i].m_StringTable.resize(tempTable.GetSize());
+      for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
+        table[i].m_StringTable[j] = tempTable[j];
+      }
+      break;
+    }
+    default:
+      DXASSERT(false, "Invalid Parameter Type");
+    }
+    if (errno == ERANGE) {
+      hlsl_test::LogErrorFmt(L"got out of range value for table %s",
+                             table[i].m_name);
+      return E_FAIL;
+    }
+  }
+  return S_OK;
+}
diff --git a/tools/clang/unittests/HLSLExec/TableParameterHandler.h b/tools/clang/unittests/HLSLExec/TableParameterHandler.h
new file mode 100644
index 0000000000..eac851a263
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/TableParameterHandler.h
@@ -0,0 +1,205 @@
+#ifndef TABLE_PARAMETER_HANDLER_H
+#define TABLE_PARAMETER_HANDLER_H
+
+#include <Verify.h>
+#include <WexString.h>
+#include <WexTestClass.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include <wchar.h>
+#include <windows.h> // For LPCWSTR
+
+#include "dxc/Support/Global.h" // For DXASSERT_ARGS
+#include "dxc/Test/HlslTestUtils.h"
+
+// One named, typed parameter of a TAEF data-driven test row, plus its value.
+struct TableParameter {
+  LPCWSTR m_name;
+  enum TableParameterType {
+    INT8,
+    INT16,
+    INT32,
+    UINT,
+    FLOAT,
+    HALF,
+    DOUBLE,
+    STRING,
+    BOOL,
+    INT8_TABLE,
+    INT16_TABLE,
+    INT32_TABLE,
+    FLOAT_TABLE,
+    HALF_TABLE,
+    DOUBLE_TABLE,
+    STRING_TABLE,
+    UINT8_TABLE,
+    UINT16_TABLE,
+    UINT32_TABLE,
+    BOOL_TABLE
+  };
+  TableParameter(LPCWSTR name, TableParameterType type, bool required)
+      : m_name(name), m_type(type), m_required(required) {}
+  TableParameterType m_type;
+  bool m_required; // required parameter
+  int8_t m_int8 = 0; // scalars default to zero instead of being left unset
+  int16_t m_int16 = 0;
+  int m_int32 = 0;
+  unsigned int m_uint = 0;
+  float m_float = 0.0f;
+  uint16_t m_half = 0; // no half type in C++; stored as uint16_t bits
+  double m_double = 0.0;
+  bool m_bool = false;
+  WEX::Common::String m_str;
+  std::vector<int8_t> m_int8Table;
+  std::vector<int16_t> m_int16Table;
+  std::vector<int> m_int32Table;
+  std::vector<uint8_t> m_uint8Table;
+  std::vector<uint16_t> m_uint16Table;
+  std::vector<unsigned int> m_uint32Table;
+  std::vector<float> m_floatTable;
+  std::vector<uint16_t> m_halfTable; // no such thing as half type in c++
+  std::vector<double> m_doubleTable;
+  std::vector<bool> m_boolTable;
+  std::vector<WEX::Common::String> m_StringTable;
+};
+
+class TableParameterHandler {
+private:
+  HRESULT ParseTableRow(); // fills m_table from TestData; run by the ctor
+
+public:
+  TableParameter *m_table; // caller's array; the pointer is stored, not copied
+  size_t m_tableSize; // number of entries in m_table
+  TableParameterHandler(TableParameter *pTable, size_t size);
+
+  TableParameter *GetTableParamByName(LPCWSTR name);
+  void clearTableParameter();
+
+  template <class T1> std::vector<T1> *GetDataArray(LPCWSTR name);
+};
+
+// Static helpers
+static bool IsHexString(PCWSTR str, uint16_t *value) {
+  // True for "0x.." (hex) or "0b.." (binary) text once spaces are removed.
+  std::wstring text(str);
+  text.erase(std::remove(text.begin(), text.end(), L' '), text.end());
+  const bool isBinary = text.compare(0, 2, L"0b") == 0;
+  if (!isBinary && text.compare(0, 2, L"0x") != 0)
+    return false;
+  // wcstol's base 0 handles "0x" but not "0b": skip that prefix, use base 2.
+  *value = (uint16_t)wcstol(&text[isBinary ? 2 : 0], NULL, isBinary ? 2 : 0);
+  return true;
+}
+
+static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
+  // Converts one table cell to float. Besides plain numbers it accepts NaN,
+  // +/-inf, +/-denorm, explicit zeros, and "0x..." taken as raw float bits.
+  std::wstring text(str);
+  text.erase(std::remove(text.begin(), text.end(), L' '), text.end());
+  text.erase(std::remove(text.begin(), text.end(), L'\n'), text.end());
+  PCWSTR wstr = text.data();
+  if (_wcsicmp(wstr, L"NaN") == 0) {
+    value = NAN;
+  } else if (_wcsicmp(wstr, L"-inf") == 0) {
+    value = -(INFINITY);
+  } else if (_wcsicmp(wstr, L"inf") == 0) {
+    value = INFINITY;
+  } else if (_wcsicmp(wstr, L"-denorm") == 0) {
+    value = -(FLT_MIN / 2);
+  } else if (_wcsicmp(wstr, L"denorm") == 0) {
+    value = FLT_MIN / 2;
+  } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
+             _wcsicmp(wstr, L"-0") == 0) {
+    value = -0.0f;
+  } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
+             _wcsicmp(wstr, L"0") == 0) {
+    value = 0.0f;
+  } else if (_wcsnicmp(wstr, L"0x", 2) == 0) {
+    unsigned temp_i = std::stoul(wstr, nullptr, 16);
+    value = (float &)temp_i;
+  } else {
+    // 0 is a legal result, so failure means wcstod consumed no characters.
+    wchar_t *end = nullptr;
+    const double val = wcstod(wstr, &end);
+    if (end == wstr) {
+      hlsl_test::LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
+      return E_FAIL;
+    }
+    value = (float)val;
+  }
+  return S_OK;
+}
+
+static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
+  // Converts one table cell to unsigned int; value is written only on success.
+  // Spaces are ignored and wcstoul infers the base from the prefix: decimal,
+  // 0-prefixed octal, or 0x-prefixed hex.
+  std::wstring text(str);
+  text.erase(std::remove(text.begin(), text.end(), L' '), text.end());
+  PCWSTR wstr = text.data();
+  // A result of 0 is legal ("00", "0x0"), so it cannot signal failure (the
+  // old check rejected every zero spelling except "0" and "0x00000000").
+  // Instead, wcstoul leaves `end` at the start when nothing was converted.
+  wchar_t *end = nullptr;
+  const unsigned int val = std::wcstoul(wstr, &end, 0);
+  if (end == wstr) {
+    hlsl_test::LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
+    return E_FAIL;
+  }
+  value = val;
+  return S_OK;
+}
+
+static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
+  // Parses `count` comma-separated floats into ptr; E_FAIL if str has fewer.
+  const std::wstring text(str);
+  size_t begin = 0;
+  for (size_t i = 0; i < count; ++i) {
+    if (begin == std::wstring::npos)
+      return E_FAIL; // list exhausted (npos + 1 used to wrap around to 0)
+    const size_t comma = text.find(L',', begin);
+    if (FAILED(ParseDataToFloat(text.substr(begin, comma - begin).c_str(),
+                                ptr[i])))
+      return E_FAIL;
+    begin = (comma == std::wstring::npos) ? comma : comma + 1;
+  }
+  return S_OK;
+}
+
+static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
+  // Parses `count` comma-separated floats, stored via ConvertFloat32ToFloat16.
+  const std::wstring text(str);
+  size_t begin = 0;
+  for (size_t i = 0; i < count; ++i) {
+    if (begin == std::wstring::npos)
+      return E_FAIL; // list exhausted (npos + 1 used to wrap around to 0)
+    const size_t comma = text.find(L',', begin);
+    float floatValue = 0.0f;
+    if (FAILED(ParseDataToFloat(text.substr(begin, comma - begin).c_str(),
+                                floatValue)))
+      return E_FAIL;
+    ptr[i] = ConvertFloat32ToFloat16(floatValue);
+    begin = (comma == std::wstring::npos) ? comma : comma + 1;
+  }
+  return S_OK;
+}
+
+static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr,
+                                     size_t count) {
+  // Parses `count` comma-separated unsigned ints into ptr; E_FAIL if fewer.
+  const std::wstring text(str);
+  size_t begin = 0;
+  for (size_t i = 0; i < count; ++i) {
+    if (begin == std::wstring::npos)
+      return E_FAIL; // list exhausted (npos + 1 used to wrap around to 0)
+    const size_t comma = text.find(L',', begin);
+    if (FAILED(ParseDataToUint(text.substr(begin, comma - begin).c_str(),
+                               ptr[i])))
+      return E_FAIL;
+    begin = (comma == std::wstring::npos) ? comma : comma + 1;
+  }
+  return S_OK;
+}
+
+#endif // TABLE_PARAMETER_HANDLER_H

From 8a77b0c714d0e0db4b1c0202f2697f91ce3928a8 Mon Sep 17 00:00:00 2001
From: Jeff Noyle <jeffno@microsoft.com>
Date: Tue, 17 Jun 2025 17:06:51 -0700
Subject: [PATCH 64/93] PIX shader debugger: Support dynamic indices for local
 arrays (#7536)

The root of the problem being addressed here is this line from the
previous version of DxilAnnotateWithVirtualRegister.cpp at (old) line
251 in function GetStructOffset:

```
    auto *pArrayIndex =
        llvm::dyn_cast<llvm::ConstantInt>(pGEP->getOperand(GEPOperandIndex++));
```

When an array is dynamically indexed, this dyn_cast of course returns
nullptr, and this function returns a zero, which eventually caused the
values of all dynamically-indexed array elements in PIX's shader
debugger to be reported as the value of the zeroth element in the array.

The next issue was that stores to an alloca-backed dynamic array weren't
being properly recognized as significant events from PIX debugger's
point of view. PIX adds its own "fake" alloca stores to help tie its
debug output with the debug info that ends up in the PDB, so it's easy
enough to co-opt that machinery to cover stores to "real" allocas, i.e.
function-local array storage. To do so, the "AnnotateStore" function
needs some of the metadata (i.e. PIX instruction number) that is added
during runOnModule here. This necessitated rearranging runOnModule and
putting stores into a vector that we then iterate over at the end of
runOnModule.

Now that indices aren't collapsed into just the zeroth, PIX needs to
know how much storage to allocate for the full array, which is the
motivation for the change in DxilDebugInstrumentation.cpp to return some
metadata that PIX can parse.

DxilDbgValueToDbgDeclare.cpp's changes are just a variable rename to aid
readability.

The rearrangement of runOnModule can induce some allocas to be visited
more than once, so there are changes in DxilPIXVirtualRegisters.cpp to
make sure we don't overwrite an existing alloca ordinal with a new one
(which would confuse previously-established references to that alloca).

File-check tests have been added to validate that:
- the stores to local arrays are being noticed properly;
- the debug pass correctly outputs the metadata that informs PIX about
  alloca sizes.

The majority of these changes really need end-to-end testing in PIX,
where I can gather real debug output as generated by the GPU in response
to the instrumentation, then match those results up with PDB data and
finally show HLSL variable contents in the shader debugger, so there are
some tests waiting on the PIX side for when this change makes its way
there.
---
 .../DxilAnnotateWithVirtualRegister.cpp       | 140 +++++++++++++-----
 .../DxilDbgValueToDbgDeclare.cpp              |  17 ++-
 .../DxilDebugInstrumentation.cpp              |  17 ++-
 lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp |  12 +-
 ...ValueToDbgDeclare_dynamic_array_index.hlsl |  27 ++++
 .../pix/Debug_dynamic_array_index.hlsl        |  19 +++
 tools/clang/unittests/HLSL/PixTest.cpp        |   1 -
 7 files changed, 178 insertions(+), 55 deletions(-)
 create mode 100644 tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl
 create mode 100644 tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl

diff --git a/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp
index babf5b7953..88f696b7fa 100644
--- a/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp
+++ b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp
@@ -76,19 +76,29 @@ class DxilAnnotateWithVirtualRegister : public llvm::ModulePass {
 
 private:
   void AnnotateValues(llvm::Instruction *pI);
-  void AnnotateStore(llvm::Instruction *pI);
-  void SplitVectorStores(hlsl::OP *HlslOP, llvm::Instruction *pI);
+  void AnnotateStore(hlsl::OP *HlslOP, llvm::Instruction *pI);
+  void SplitVectorStores(llvm::Instruction *pI);
   bool IsAllocaRegisterWrite(llvm::Value *V, llvm::AllocaInst **pAI,
                              llvm::Value **pIdx);
   void AnnotateAlloca(llvm::AllocaInst *pAlloca);
   void AnnotateGeneric(llvm::Instruction *pI);
   void AssignNewDxilRegister(llvm::Instruction *pI);
   void AssignNewAllocaRegister(llvm::AllocaInst *pAlloca, std::uint32_t C);
-
+  llvm::Value *AddConstIntValues(llvm::Value *l, llvm::Value *r);
+  llvm::Value *MultiplyConstIntValue(llvm::Value *l, uint32_t r);
+  llvm::Value *GetStructOffset(llvm::GetElementPtrInst *pGEP,
+                               uint32_t &GEPOperandIndex,
+                               llvm::Type *pElementType);
   hlsl::DxilModule *m_DM;
   std::uint32_t m_uVReg;
   std::unique_ptr<llvm::ModuleSlotTracker> m_MST;
   int m_StartInstruction = 0;
+  struct RememberedAllocaStores {
+    llvm::StoreInst *StoreInst;
+    llvm::Value *Index;
+    llvm::MDNode *AllocaReg;
+  };
+  std::vector<RememberedAllocaStores> m_RememberedAllocaStores;
 
   void Init(llvm::Module &M) {
     m_DM = &M.GetOrCreateDxilModule();
@@ -129,8 +139,6 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
     m_DM->SetValidatorVersion(1, 4);
   }
 
-  std::uint32_t InstNum = m_StartInstruction;
-
   auto instrumentableFunctions =
       PIXPassHelpers::GetAllInstrumentableFunctions(*m_DM);
 
@@ -138,7 +146,7 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
     for (auto &block : F->getBasicBlockList()) {
       for (auto it = block.begin(); it != block.end();) {
         llvm::Instruction *I = &*(it++);
-        SplitVectorStores(m_DM->GetOP(), I);
+        SplitVectorStores(I);
       }
     }
   }
@@ -151,17 +159,32 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
     }
   }
 
+  // Process all allocas referenced by dbg.declare intrinsics
   for (auto *F : instrumentableFunctions) {
     for (auto &block : F->getBasicBlockList()) {
-      for (llvm::Instruction &I : block.getInstList()) {
-        AnnotateStore(&I);
+      for (auto &I : block) {
+        if (auto *DbgDeclare = llvm::dyn_cast<llvm::DbgDeclareInst>(&I)) {
+          // The first operand of DbgDeclare is the address (typically an
+          // AllocaInst)
+          if (auto *AddrVal =
+                  llvm::dyn_cast<llvm::Instruction>(DbgDeclare->getAddress())) {
+            AnnotateValues(AddrVal);
+          }
+        }
       }
     }
   }
 
+  for (auto *F : instrumentableFunctions)
+    for (auto &block : F->getBasicBlockList()) {
+      for (llvm::Instruction &I : block.getInstList()) {
+        AnnotateStore(m_DM->GetOP(), &I);
+      }
+    }
+
   for (auto *F : instrumentableFunctions) {
-    int InstructionRangeStart = InstNum;
-    int InstructionRangeEnd = InstNum;
+    int InstructionRangeStart = m_StartInstruction;
+    int InstructionRangeEnd = m_StartInstruction;
     for (auto &block : F->getBasicBlockList()) {
       for (llvm::Instruction &I : block.getInstList()) {
         // If the instruction is part of the debug value instrumentation added
@@ -171,8 +194,9 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
           if (PixAllocaReg::FromInst(Alloca, &unused1, &unused2))
             continue;
         if (!llvm::isa<llvm::DbgDeclareInst>(&I)) {
-          pix_dxil::PixDxilInstNum::AddMD(M.getContext(), &I, InstNum++);
-          InstructionRangeEnd = InstNum;
+          pix_dxil::PixDxilInstNum::AddMD(M.getContext(), &I,
+                                          m_StartInstruction++);
+          InstructionRangeEnd = m_StartInstruction;
         }
       }
     }
@@ -188,12 +212,17 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
     }
   }
 
+  for (auto const &as : m_RememberedAllocaStores) {
+    PixAllocaRegWrite::AddMD(m_DM->GetCtx(), as.StoreInst, as.AllocaReg,
+                             as.Index);
+  }
+
   if (OSOverride != nullptr) {
     // Print a set of strings of the exemplary form "InstructionCount: <n>
     // <fnName>"
     if (m_DM->GetShaderModel()->GetKind() == hlsl::ShaderModel::Kind::Library)
       *OSOverride << "\nIsLibrary\n";
-    *OSOverride << "\nInstructionCount:" << InstNum << "\n";
+    *OSOverride << "\nInstructionCount:" << m_StartInstruction << "\n";
   }
 
   m_DM = nullptr;
@@ -210,7 +239,8 @@ void DxilAnnotateWithVirtualRegister::AnnotateValues(llvm::Instruction *pI) {
   }
 }
 
-void DxilAnnotateWithVirtualRegister::AnnotateStore(llvm::Instruction *pI) {
+void DxilAnnotateWithVirtualRegister::AnnotateStore(hlsl::OP *HlslOP,
+                                                    llvm::Instruction *pI) {
   auto *pSt = llvm::dyn_cast<llvm::StoreInst>(pI);
   if (pSt == nullptr) {
     return;
@@ -226,15 +256,47 @@ void DxilAnnotateWithVirtualRegister::AnnotateStore(llvm::Instruction *pI) {
   if (AllocaReg == nullptr) {
     return;
   }
+  m_RememberedAllocaStores.push_back({pSt, Index, AllocaReg});
+}
+
+llvm::Value *
+DxilAnnotateWithVirtualRegister::MultiplyConstIntValue(llvm::Value *l,
+                                                       uint32_t r) {
+  if (r == 1)
+    return l;
+  if (auto *lci = llvm::dyn_cast<llvm::ConstantInt>(l))
+    return m_DM->GetOP()->GetU32Const(lci->getLimitedValue() * r);
+  // Should never get here, but if we do, return the left as a reasonable
+  // default:
+  return l;
+}
 
-  PixAllocaRegWrite::AddMD(m_DM->GetCtx(), pSt, AllocaReg, Index);
+llvm::Value *
+DxilAnnotateWithVirtualRegister::AddConstIntValues(llvm::Value *l,
+                                                   llvm::Value *r) {
+  auto *rci = llvm::dyn_cast<llvm::ConstantInt>(r);
+  if (rci && rci->getLimitedValue() == 0)
+    return l;
+  auto *lci = llvm::dyn_cast<llvm::ConstantInt>(l);
+  if (lci && lci->getLimitedValue() == 0)
+    return r;
+  // Both an assert and a check, in case of unexpected circumstances.
+  DXASSERT(lci != nullptr && rci != nullptr,
+           "Both sides of add should be constant ints");
+  if (lci != nullptr && rci != nullptr)
+    return m_DM->GetOP()->GetU32Const(lci->getLimitedValue() +
+                                      rci->getLimitedValue());
+  // In an emergency, return the left argument. It'll be closest to
+  // the desired value.
+  return l;
 }
 
-static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP,
-                                uint32_t &GEPOperandIndex,
-                                llvm::Type *pElementType) {
+llvm::Value *
+DxilAnnotateWithVirtualRegister::GetStructOffset(llvm::GetElementPtrInst *pGEP,
+                                                 uint32_t &GEPOperandIndex,
+                                                 llvm::Type *pElementType) {
   if (IsInstrumentableFundamentalType(pElementType)) {
-    return 0;
+    return m_DM->GetOP()->GetU32Const(0);
   } else if (auto *pArray = llvm::dyn_cast<llvm::ArrayType>(pElementType)) {
     // 1D-array example:
     //
@@ -248,18 +310,13 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP,
     //  -The zeroth element in the struct (which is the array)
     //  -The zeroth element in that array
 
-    auto *pArrayIndex =
-        llvm::dyn_cast<llvm::ConstantInt>(pGEP->getOperand(GEPOperandIndex++));
-
-    if (pArrayIndex == nullptr) {
-      return 0;
-    }
+    auto *pArrayIndex = pGEP->getOperand(GEPOperandIndex++);
 
-    uint32_t ArrayIndex = pArrayIndex->getLimitedValue();
     auto pArrayElementType = pArray->getArrayElementType();
-    uint32_t MemberIndex = ArrayIndex * CountStructMembers(pArrayElementType);
-    return MemberIndex +
-           GetStructOffset(pGEP, GEPOperandIndex, pArrayElementType);
+    auto *MemberIndex = MultiplyConstIntValue(
+        pArrayIndex, CountStructMembers(pArrayElementType));
+    return AddConstIntValues(
+        MemberIndex, GetStructOffset(pGEP, GEPOperandIndex, pArrayElementType));
   } else if (auto *pStruct = llvm::dyn_cast<llvm::StructType>(pElementType)) {
     DXASSERT(GEPOperandIndex < pGEP->getNumOperands(),
              "Unexpectedly read too many GetElementPtrInst operands");
@@ -268,7 +325,7 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP,
         llvm::dyn_cast<llvm::ConstantInt>(pGEP->getOperand(GEPOperandIndex++));
 
     if (pMemberIndex == nullptr) {
-      return 0;
+      return m_DM->GetOP()->GetU32Const(0);
     }
 
     uint32_t MemberIndex = pMemberIndex->getLimitedValue();
@@ -278,16 +335,17 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP,
       MemberOffset += CountStructMembers(pStruct->getElementType(i));
     }
 
-    return MemberOffset + GetStructOffset(pGEP, GEPOperandIndex,
-                                          pStruct->getElementType(MemberIndex));
+    return AddConstIntValues(
+        m_DM->GetOP()->GetU32Const(MemberOffset),
+        GetStructOffset(pGEP, GEPOperandIndex,
+                        pStruct->getElementType(MemberIndex)));
   } else {
-    return 0;
+    return m_DM->GetOP()->GetU32Const(0);
   }
 }
 
 bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite(
     llvm::Value *V, llvm::AllocaInst **pAI, llvm::Value **pIdx) {
-  llvm::IRBuilder<> B(m_DM->GetCtx());
 
   *pAI = nullptr;
   *pIdx = nullptr;
@@ -366,7 +424,8 @@ bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite(
 
     auto offset = GetStructOffset(pGEP, GEPOperandIndex, pStructType);
 
-    llvm::Value *IndexValue = B.getInt32(offset + precedingMemberCount);
+    llvm::Value *IndexValue = AddConstIntValues(
+        offset, m_DM->GetOP()->GetU32Const(precedingMemberCount));
 
     if (IndexValue != nullptr) {
       *pAI = Alloca;
@@ -383,7 +442,7 @@ bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite(
     }
 
     *pAI = pAlloca;
-    *pIdx = B.getInt32(0);
+    *pIdx = m_DM->GetOP()->GetU32Const(0);
     return true;
   }
 
@@ -463,12 +522,13 @@ void DxilAnnotateWithVirtualRegister::AssignNewDxilRegister(
 
 void DxilAnnotateWithVirtualRegister::AssignNewAllocaRegister(
     llvm::AllocaInst *pAlloca, std::uint32_t C) {
-  PixAllocaReg::AddMD(m_DM->GetCtx(), pAlloca, m_uVReg, C);
-  m_uVReg += C;
+  if (!PixAllocaReg::FromInst(pAlloca, nullptr, nullptr)) {
+    PixAllocaReg::AddMD(m_DM->GetCtx(), pAlloca, m_uVReg, C);
+    m_uVReg += C;
+  }
 }
 
-void DxilAnnotateWithVirtualRegister::SplitVectorStores(hlsl::OP *HlslOP,
-                                                        llvm::Instruction *pI) {
+void DxilAnnotateWithVirtualRegister::SplitVectorStores(llvm::Instruction *pI) {
   auto *pSt = llvm::dyn_cast<llvm::StoreInst>(pI);
   if (pSt == nullptr) {
     return;
diff --git a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
index bf25d9f85f..9ddbe876b5 100644
--- a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
+++ b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
@@ -36,7 +36,7 @@ using namespace PIXPassHelpers;
 
 using namespace llvm;
 
-//#define VALUE_TO_DECLARE_LOGGING
+// #define VALUE_TO_DECLARE_LOGGING
 
 #ifdef VALUE_TO_DECLARE_LOGGING
 #ifndef PIX_DEBUG_DUMP_HELPER
@@ -859,8 +859,8 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
     VALUE_TO_DECLARE_LOG("... variable was null too");
   }
 
-  llvm::Value *V = DbgValue->getValue();
-  if (V == nullptr) {
+  llvm::Value *ValueFromDbgInst = DbgValue->getValue();
+  if (ValueFromDbgInst == nullptr) {
     // The metadata contained a null Value, so we ignore it. This
     // seems to be a dxcompiler bug.
     VALUE_TO_DECLARE_LOG("...Null value!");
@@ -873,20 +873,20 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
     return;
   }
 
-  if (llvm::isa<llvm::PointerType>(V->getType())) {
+  if (llvm::isa<llvm::PointerType>(ValueFromDbgInst->getType())) {
     // Safeguard: If the type is not a pointer type, then this is
     // dbg.value directly pointing to a memory location instead of
     // a value.
     if (!IsDITypePointer(Ty, EmptyMap)) {
       // We only know how to handle AllocaInsts for now
-      if (!isa<AllocaInst>(V)) {
+      if (!isa<AllocaInst>(ValueFromDbgInst)) {
         VALUE_TO_DECLARE_LOG(
             "... variable had pointer type, but is not an alloca.");
         return;
       }
 
       IRBuilder<> B(DbgValue->getNextNode());
-      V = B.CreateLoad(V);
+      ValueFromDbgInst = B.CreateLoad(ValueFromDbgInst);
     }
   }
 
@@ -931,7 +931,7 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
   }
 
   const OffsetInBits InitialOffset = PackedOffsetFromVar;
-  auto *insertPt = llvm::dyn_cast<llvm::Instruction>(V);
+  auto *insertPt = llvm::dyn_cast<llvm::Instruction>(ValueFromDbgInst);
   if (insertPt != nullptr && !llvm::isa<TerminatorInst>(insertPt)) {
     insertPt = insertPt->getNextNode();
     // Drivers may crash if phi nodes aren't always at the top of a block,
@@ -950,7 +950,8 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
       // Offset}. InitialOffset is the offset from DbgValue's expression
       // (i.e., the offset from the Variable's start), and Offset is the
       // Scalar Value's packed offset from DbgValue's value.
-      for (const ValueAndOffset &VO : SplitValue(V, InitialOffset, B)) {
+      for (const ValueAndOffset &VO :
+           SplitValue(ValueFromDbgInst, InitialOffset, B)) {
 
         OffsetInBits AlignedOffset;
         if (!Offsets.GetAlignedOffsetFromPackedOffset(VO.m_PackedOffset,
diff --git a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
index a7d7e72cb4..4dd43b07cc 100644
--- a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
+++ b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
@@ -1356,7 +1356,19 @@ DxilDebugInstrumentation::FindInstrumentableInstructionsInBlock(
           IndexingToken = "s"; // static indexing, no debug output required
         } else {
           IndexingToken = "d"; // dynamic indexing
-          RegisterOrStaticIndex = std::to_string(IandT->AllocaBase);
+          int MaxArraySize = 1;
+          if (auto *Store = dyn_cast<StoreInst>(&Inst)) {
+            if (auto *GEP =
+                    dyn_cast<GetElementPtrInst>(Store->getPointerOperand())) {
+              if (auto *Alloca =
+                      dyn_cast<AllocaInst>(GEP->getPointerOperand())) {
+                MaxArraySize =
+                    Alloca->getAllocatedType()->getArrayNumElements();
+              }
+            }
+          }
+          RegisterOrStaticIndex = std::to_string(IandT->AllocaBase) + "-" +
+                                  std::to_string(MaxArraySize);
           DebugOutputForThisInstruction.ValueToWriteToDebugMemory =
               IandT->AllocaWriteIndex;
         }
@@ -1374,7 +1386,8 @@ DxilDebugInstrumentation::FindInstrumentableInstructionsInBlock(
         *OSOverride << "," << *RegisterOrStaticIndex;
       }
       if (IandT->ConstantAllocaStoreValue) {
-        *OSOverride << "," << std::to_string(*IandT->ConstantAllocaStoreValue);
+        uint64_t value = IandT->ConstantAllocaStoreValue.value();
+        *OSOverride << "," << std::to_string(value);
       }
       *OSOverride << ";";
       if (DebugOutputForThisInstruction.ValueToWriteToDebugMemory)
diff --git a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
index f68e2082bc..a60f6a77a7 100644
--- a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
+++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
@@ -124,8 +124,10 @@ static bool ParsePixAllocaReg(llvm::MDNode *MD, std::uint32_t *RegNum,
     return false;
   }
 
-  *RegNum = mdRegNum->getLimitedValue();
-  *Count = mdCount->getLimitedValue();
+  if (RegNum != nullptr)
+    *RegNum = mdRegNum->getLimitedValue();
+  if (Count != nullptr)
+    *Count = mdCount->getLimitedValue();
   return true;
 }
 
@@ -144,8 +146,10 @@ void pix_dxil::PixAllocaReg::AddMD(llvm::LLVMContext &Ctx,
 bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst const *pAlloca,
                                       std::uint32_t *pRegBase,
                                       std::uint32_t *pRegSize) {
-  *pRegBase = 0;
-  *pRegSize = 0;
+  if (pRegBase != nullptr)
+    *pRegBase = 0;
+  if (pRegSize != nullptr)
+    *pRegSize = 0;
 
   auto *mdNodes = pAlloca->getMetadata(MDName);
   if (mdNodes == nullptr) {
diff --git a/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl b/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl
new file mode 100644
index 0000000000..cba891424a
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -Tcs_6_0 /Od %s | %opt -S -dxil-annotate-with-virtual-regs | %FileCheck %s
+
+// Check that there is an alloca backing the local array
+// CHECK: [[ARRAYNAME:%.*]] = alloca [4 x float]
+
+// Grab the GEP for the above array's element that we're expecting to store to:
+// CHECK: [[ARRAYELEMENTPTR:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[ARRAYNAME]]
+
+// Check that the store to the alloca is annotated with pix-alloca-reg-write metadata
+// (meaning that the pass accurately noted that the 8.0 is stored to a dynamic array index)
+// CHECK: store float 8.000000e+00, float* [[ARRAYELEMENTPTR]]
+// CHECK-SAME: !pix-alloca-reg-write
+
+
+RWByteAddressBuffer RawUAV: register(u1);
+
+[numthreads(1, 1, 1)]
+void main()
+{
+    float local_array[4];
+    local_array[RawUAV.Load(0)] = 8;
+    local_array[RawUAV.Load(1)] = 128;
+
+    RawUAV.Store(64+0,local_array[0]);
+    RawUAV.Store(64+4,local_array[1]);
+}
+
diff --git a/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl b/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl
new file mode 100644
index 0000000000..9ab5bce95a
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -Tcs_6_0 /Od %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,UAVSize=128,upstreamSVPositionRow=2 -hlsl-dxilemit | %FileCheck %s
+
+// Check that there is a block precis that correctly returns that the array is a 4-value float array
+// CHECK: Block#0
+// CHECK-SAME: d,0-4
+
+RWByteAddressBuffer RawUAV: register(u1);
+
+[numthreads(1, 1, 1)]
+void main()
+{
+    float local_array[4];
+    local_array[RawUAV.Load(0)] = 8;
+    local_array[RawUAV.Load(1)] = 128;
+
+    RawUAV.Store(64+0,local_array[0]);
+    RawUAV.Store(64+4,local_array[1]);
+}
+
diff --git a/tools/clang/unittests/HLSL/PixTest.cpp b/tools/clang/unittests/HLSL/PixTest.cpp
index af7801c7bf..c032e9e872 100644
--- a/tools/clang/unittests/HLSL/PixTest.cpp
+++ b/tools/clang/unittests/HLSL/PixTest.cpp
@@ -1220,7 +1220,6 @@ PixTest::TestableResults PixTest::TestStructAnnotationCase(
 
 #if 0 // handy for debugging
   auto disTextW = Disassemble(pAnnotatedContainer);
-  WEX::Logging::Log::Comment(disTextW.c_str());
 #endif
 
   ModuleAndHangersOn moduleEtc(pAnnotatedContainer);

From 978a6d3f13eef89a3cf513da55e7d1b16fb8aef4 Mon Sep 17 00:00:00 2001
From: Steve Urquhart <53908460+SteveUrquhart@users.noreply.github.com>
Date: Wed, 18 Jun 2025 06:36:46 -0400
Subject: [PATCH 65/93] [SPIRV] Emit DebugScope in wrapper (#7341) (#7529)

Legalization and optimization will produce inaccurate NS100 debug info
if there is no DebugScope emitted in the wrapper function. This PR
corrects this oversight and renames the wrapper to "__dxc_setup". This
may cause a stack frame named __dxc_setup to appear in an HLSL debugger,
however, users should be familiar with this type of thing. A C debugger
might show crt0, or a debugger can filter this frame out of the user's
view. This PR addresses
[7341](https://github.com/microsoft/DirectXShaderCompiler/issues/7341)
---
 tools/clang/lib/SPIRV/EmitVisitor.cpp          |  6 ------
 tools/clang/lib/SPIRV/SpirvEmitter.cpp         | 18 ++++++++++++++----
 tools/clang/lib/SPIRV/SpirvEmitter.h           |  1 +
 .../rich.debug.function.param.hlsl             |  2 +-
 .../CodeGenSPIRV/shader.debug.function.hlsl    |  2 +-
 5 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp
index 8de0262ae6..eb94ce0797 100644
--- a/tools/clang/lib/SPIRV/EmitVisitor.cpp
+++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp
@@ -1635,12 +1635,6 @@ bool EmitVisitor::visit(SpirvDebugLexicalBlock *inst) {
 }
 
 bool EmitVisitor::visit(SpirvDebugScope *inst) {
-  // Technically entry function wrappers do not exist in HLSL. They
-  // are just created by DXC. We do not want to emit DebugScope for
-  // it.
-  if (inEntryFunctionWrapper)
-    return true;
-
   initInstruction(inst);
   curInst.push_back(inst->getResultTypeId());
   curInst.push_back(getOrAssignResultId<SpirvInstruction>(inst));
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index cc7016b594..850a8dd736 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -14050,8 +14050,8 @@ bool SpirvEmitter::processTessellationShaderAttributes(
 }
 
 bool SpirvEmitter::emitEntryFunctionWrapperForRayTracing(
-    const FunctionDecl *decl, SpirvDebugFunction *debugFunction,
-    SpirvFunction *entryFuncInstr) {
+    const FunctionDecl *decl, RichDebugInfo **info,
+    SpirvDebugFunction *debugFunction, SpirvFunction *entryFuncInstr) {
   // The entry basic block.
   auto *entryLabel = spvBuilder.createBasicBlock();
   spvBuilder.setInsertPoint(entryLabel);
@@ -14160,6 +14160,10 @@ bool SpirvEmitter::emitEntryFunctionWrapperForRayTracing(
   spvBuilder.createReturn(decl->getBody()->getLocEnd());
   spvBuilder.endFunction();
 
+  if (spirvOptions.debugInfoRich && decl->hasBody()) {
+    spvContext.popDebugLexicalScope(*info);
+  }
+
   return true;
 }
 
@@ -14374,7 +14378,9 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
       astContext.VoidTy, decl->getLocStart(), decl->getName());
 
   if (spirvOptions.debugInfoRich && decl->hasBody()) {
-    *debugFunction = emitDebugFunction(decl, entryFunction, info, "wrapper");
+    *debugFunction =
+        emitDebugFunction(decl, entryFunction, info, "__dxc_setup");
+    spvContext.pushDebugLexicalScope(*info, *debugFunction);
   }
 
   // Specify that entryFunction is an entry function wrapper.
@@ -14391,7 +14397,7 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
   entryInfo->entryFunction = entryFunction;
 
   if (spvContext.isRay()) {
-    return emitEntryFunctionWrapperForRayTracing(decl, *debugFunction,
+    return emitEntryFunctionWrapperForRayTracing(decl, info, *debugFunction,
                                                  entryFuncInstr)
                ? entryFunction
                : nullptr;
@@ -14632,6 +14638,10 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper(
   if (spvContext.isHS())
     doDecl(patchConstFunc);
 
+  if (spirvOptions.debugInfoRich && decl->hasBody()) {
+    spvContext.popDebugLexicalScope(*info);
+  }
+
   return entryFunction;
 }
 
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h
index 14401c6418..ada8db3068 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.h
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.h
@@ -958,6 +958,7 @@ class SpirvEmitter : public ASTConsumer {
   /// The wrapper function is also responsible for initializing global static
   /// variables for some cases.
   bool emitEntryFunctionWrapperForRayTracing(const FunctionDecl *entryFunction,
+                                             RichDebugInfo **info,
                                              SpirvDebugFunction *debugFunction,
                                              SpirvFunction *entryFuncId);
 
diff --git a/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl b/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl
index 9576837884..a3701a4ed4 100644
--- a/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl
@@ -9,7 +9,7 @@
 // CHECK:               [[x:%[0-9]+]] = OpString "x"
 // CHECK:     [[srcMainName:%[0-9]+]] = OpString "main"
 // CHECK:           [[color:%[0-9]+]] = OpString "color"
-// CHECK:        [[mainName:%[0-9]+]] = OpString "wrapper"
+// CHECK:        [[mainName:%[0-9]+]] = OpString "__dxc_setup"
 
 // CHECK: [[int:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 Signed
 // CHECK: [[float:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 Float
diff --git a/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl b/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl
index b263fd88ad..23bb479a46 100644
--- a/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl
@@ -6,7 +6,7 @@
 // CHECK:         [[fooName:%[0-9]+]] = OpString "foo"
 // CHECK:        [[emptyStr:%[0-9]+]] = OpString ""
 // CHECK:     [[srcMainName:%[0-9]+]] = OpString "main"
-// CHECK:        [[mainName:%[0-9]+]] = OpString "wrapper"
+// CHECK:        [[mainName:%[0-9]+]] = OpString "__dxc_setup"
 // CHECK:          [[clOpts:%[0-9]+]] = OpString " -E main -T ps_6_0 -spirv -fcgl -fspv-debug=vulkan
 
 // CHECK:    [[int:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 %uint_4 %uint_0

From d43d909801c185e5bad11a683a970cd23957c3c9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 18 Jun 2025 13:23:05 -0700
Subject: [PATCH 66/93] Bump requests from 2.32.0 to 2.32.4 in /utils/git
 (#7524)

Bumps [requests](https://github.com/psf/requests) from 2.32.0 to 2.32.4.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/psf/requests/releases">requests's
releases</a>.</em></p>
<blockquote>
<h2>v2.32.4</h2>
<h2>2.32.4 (2025-06-10)</h2>
<p><strong>Security</strong></p>
<ul>
<li>CVE-2024-47081 Fixed an issue where a maliciously crafted URL and
trusted
environment will retrieve credentials for the wrong hostname/machine
from a
netrc file. (<a
href="https://redirect.github.com/psf/requests/issues/6965">#6965</a>)</li>
</ul>
<p><strong>Improvements</strong></p>
<ul>
<li>Numerous documentation improvements</li>
</ul>
<p><strong>Deprecations</strong></p>
<ul>
<li>Added support for pypy 3.11 for Linux and macOS. (<a
href="https://redirect.github.com/psf/requests/issues/6926">#6926</a>)</li>
<li>Dropped support for pypy 3.9 following its end of support. (<a
href="https://redirect.github.com/psf/requests/issues/6926">#6926</a>)</li>
</ul>
<h2>v2.32.3</h2>
<h2>2.32.3 (2024-05-29)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Fixed bug breaking the ability to specify custom SSLContexts in
sub-classes of
HTTPAdapter. (<a
href="https://redirect.github.com/psf/requests/issues/6716">#6716</a>)</li>
<li>Fixed issue where Requests started failing to run on Python versions
compiled
without the <code>ssl</code> module. (<a
href="https://redirect.github.com/psf/requests/issues/6724">#6724</a>)</li>
</ul>
<h2>v2.32.2</h2>
<h2>2.32.2 (2024-05-21)</h2>
<p><strong>Deprecations</strong></p>
<ul>
<li>
<p>To provide a more stable migration for custom HTTPAdapters impacted
by the CVE changes in 2.32.0, we've renamed <code>_get_connection</code>
to
a new public API, <code>get_connection_with_tls_context</code>. Existing
custom
HTTPAdapters will need to migrate their code to use this new API.
<code>get_connection</code> is considered deprecated in all versions of
Requests&gt;=2.32.0.</p>
<p>A minimal (2-line) example has been provided in the linked PR to ease
migration, but we strongly urge users to evaluate if their custom
adapter
is subject to the same issue described in CVE-2024-35195. (<a
href="https://redirect.github.com/psf/requests/issues/6710">#6710</a>)</p>
</li>
</ul>
<h2>v2.32.1</h2>
<h2>2.32.1 (2024-05-20)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Add missing test certs to the sdist distributed on PyPI.</li>
</ul>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/psf/requests/blob/main/HISTORY.md">requests's
changelog</a>.</em></p>
<blockquote>
<h2>2.32.4 (2025-06-10)</h2>
<p><strong>Security</strong></p>
<ul>
<li>CVE-2024-47081 Fixed an issue where a maliciously crafted URL and
trusted
environment will retrieve credentials for the wrong hostname/machine
from a
netrc file.</li>
</ul>
<p><strong>Improvements</strong></p>
<ul>
<li>Numerous documentation improvements</li>
</ul>
<p><strong>Deprecations</strong></p>
<ul>
<li>Added support for pypy 3.11 for Linux and macOS.</li>
<li>Dropped support for pypy 3.9 following its end of support.</li>
</ul>
<h2>2.32.3 (2024-05-29)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Fixed bug breaking the ability to specify custom SSLContexts in
sub-classes of
HTTPAdapter. (<a
href="https://redirect.github.com/psf/requests/issues/6716">#6716</a>)</li>
<li>Fixed issue where Requests started failing to run on Python versions
compiled
without the <code>ssl</code> module. (<a
href="https://redirect.github.com/psf/requests/issues/6724">#6724</a>)</li>
</ul>
<h2>2.32.2 (2024-05-21)</h2>
<p><strong>Deprecations</strong></p>
<ul>
<li>
<p>To provide a more stable migration for custom HTTPAdapters impacted
by the CVE changes in 2.32.0, we've renamed <code>_get_connection</code>
to
a new public API, <code>get_connection_with_tls_context</code>. Existing
custom
HTTPAdapters will need to migrate their code to use this new API.
<code>get_connection</code> is considered deprecated in all versions of
Requests&gt;=2.32.0.</p>
<p>A minimal (2-line) example has been provided in the linked PR to ease
migration, but we strongly urge users to evaluate if their custom
adapter
is subject to the same issue described in CVE-2024-35195. (<a
href="https://redirect.github.com/psf/requests/issues/6710">#6710</a>)</p>
</li>
</ul>
<h2>2.32.1 (2024-05-20)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Add missing test certs to the sdist distributed on PyPI.</li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/psf/requests/commit/021dc729f0b71a3030cefdbec7fb57a0e80a6cfd"><code>021dc72</code></a>
Polish up release tooling for last manual release</li>
<li><a
href="https://github.com/psf/requests/commit/821770e822a20a21b207b3907ea83878bda1d396"><code>821770e</code></a>
Bump version and add release notes for v2.32.4</li>
<li><a
href="https://github.com/psf/requests/commit/59f8aa2adf1d3d06bcbf7ce6b13743a1639a5401"><code>59f8aa2</code></a>
Add netrc file search information to authentication documentation (<a
href="https://redirect.github.com/psf/requests/issues/6876">#6876</a>)</li>
<li><a
href="https://github.com/psf/requests/commit/5b4b64c3467fd7a3c03f91ee641aaa348b6bed3b"><code>5b4b64c</code></a>
Add more tests to prevent regression of CVE 2024 47081</li>
<li><a
href="https://github.com/psf/requests/commit/7bc45877a86192af77645e156eb3744f95b47dae"><code>7bc4587</code></a>
Add new test to check netrc auth leak (<a
href="https://redirect.github.com/psf/requests/issues/6962">#6962</a>)</li>
<li><a
href="https://github.com/psf/requests/commit/96ba401c1296ab1dda74a2365ef36d88f7d144ef"><code>96ba401</code></a>
Only use hostname to do netrc lookup instead of netloc</li>
<li><a
href="https://github.com/psf/requests/commit/7341690e842a23cf18ded0abd9229765fa88c4e2"><code>7341690</code></a>
Merge pull request <a
href="https://redirect.github.com/psf/requests/issues/6951">#6951</a>
from tswast/patch-1</li>
<li><a
href="https://github.com/psf/requests/commit/6716d7c9f29df636643fa2489f98890216525cb0"><code>6716d7c</code></a>
remove links</li>
<li><a
href="https://github.com/psf/requests/commit/a7e1c745dc23c18e836febd672416ed0c5d8d8ae"><code>a7e1c74</code></a>
Update docs/conf.py</li>
<li><a
href="https://github.com/psf/requests/commit/c799b8167a13416833ad3b4f3298261a477e826f"><code>c799b81</code></a>
docs: fix dead links to kenreitz.org</li>
<li>Additional commits viewable in <a
href="https://github.com/psf/requests/compare/v2.32.0...v2.32.4">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=requests&package-manager=pip&previous-version=2.32.0&new-version=2.32.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the
[Security Alerts
page](https://github.com/microsoft/DirectXShaderCompiler/network/alerts).

</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 utils/git/requirements_formatting.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/git/requirements_formatting.txt b/utils/git/requirements_formatting.txt
index 6f3e07dcf2..14123e4ac0 100644
--- a/utils/git/requirements_formatting.txt
+++ b/utils/git/requirements_formatting.txt
@@ -42,7 +42,7 @@ pyjwt[crypto]==2.8.0
     # via pygithub
 pynacl==1.5.0
     # via pygithub
-requests==2.32.0
+requests==2.32.4
     # via pygithub
 toml==0.10.2
     # via darker

From 5aec1ec4e4d0e31a263f24458c598a3b151c0d4f Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Wed, 18 Jun 2025 17:19:48 -0700
Subject: [PATCH 67/93] [NFC] Address compiler warnings: C4146 - Cases where we
 can swap to using ~ operator (#7551)

Addresses #7550.
---
 include/llvm/ADT/StringExtras.h                   |  4 ++--
 include/llvm/CodeGen/SelectionDAGNodes.h          |  6 +++---
 .../llvm/DebugInfo/DWARF/DWARFDebugRangeList.h    |  4 ++--
 include/llvm/Support/LEB128.h                     |  2 +-
 lib/Bitcode/Reader/BitcodeReader.cpp              |  2 +-
 lib/Bitcode/Writer/BitcodeWriter.cpp              |  4 ++--
 lib/DXIL/DxilUtil.cpp                             |  6 +++---
 lib/Support/APFloat.cpp                           |  4 ++--
 lib/Support/DataExtractor.cpp                     |  2 +-
 lib/Transforms/IPO/DeadArgumentElimination.cpp    |  4 ++--
 .../InstCombine/InstCombineSimplifyDemanded.cpp   |  4 ++--
 tools/clang/lib/Lex/LiteralSupport.cpp            | 15 ++++++++++-----
 tools/clang/lib/Sema/SemaDecl.cpp                 |  4 ++--
 13 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 270989b349..684ee0f9dc 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -36,12 +36,12 @@ static inline StringRef toStringRef(bool B) {
 /// Interpret the given character \p C as a hexadecimal digit and return its
 /// value.
 ///
-/// If \p C is not a valid hex digit, -1U is returned.
+/// If \p C is not a valid hex digit, ~0U is returned.
 static inline unsigned hexDigitValue(char C) {
   if (C >= '0' && C <= '9') return C-'0';
   if (C >= 'a' && C <= 'f') return C-'a'+10U;
   if (C >= 'A' && C <= 'F') return C-'A'+10U;
-  return -1U;
+  return ~0U;
 }
 
 /// utohex_buffer - Emit the specified number into the buffer specified by
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index d4a6371216..ba63d80e94 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -191,12 +191,12 @@ class SDValue {
 template<> struct DenseMapInfo<SDValue> {
   static inline SDValue getEmptyKey() {
     SDValue V;
-    V.ResNo = -1U;
+    V.ResNo = ~0U;
     return V;
   }
   static inline SDValue getTombstoneKey() {
     SDValue V;
-    V.ResNo = -2U;
+    V.ResNo = ~1U;
     return V;
   }
   static unsigned getHashValue(const SDValue &Val) {
@@ -879,7 +879,7 @@ inline SDValue::SDValue(SDNode *node, unsigned resno)
     : Node(node), ResNo(resno) {
   assert((!Node || ResNo < Node->getNumValues()) &&
          "Invalid result number for the given node!");
-  assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps.");
+  assert(ResNo < ~1U && "Cannot use result numbers reserved for DenseMaps.");
 }
 
 inline unsigned SDValue::getOpcode() const {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index c930bd603d..8eea252b60 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -49,9 +49,9 @@ class DWARFDebugRangeList {
     bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
       assert(AddressSize == 4 || AddressSize == 8);
       if (AddressSize == 4)
-        return StartAddress == -1U;
+        return StartAddress == ~0U;
       else
-        return StartAddress == -1ULL;
+        return StartAddress == ~0ULL;
     }
   };
 
diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
index 1324cb82ca..f8a2843412 100644
--- a/include/llvm/Support/LEB128.h
+++ b/include/llvm/Support/LEB128.h
@@ -103,7 +103,7 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr) {
   } while (Byte >= 128);
   // Sign extend negative numbers.
   if (Byte & 0x40)
-    Value |= (-1ULL) << Shift;
+    Value |= (~0ULL) << Shift;
   if (n)
     *n = (unsigned)(p - orig_p);
   return Value;
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 185c291d66..a87128ca26 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2401,7 +2401,7 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
   if ((V & 1) == 0)
     return V >> 1;
   if (V != 1)
-    return -(V >> 1);
+    return ~(V >> 1) + 1;
   // There is no such thing as -0 with integers.  "-0" really means MININT.
   return 1ULL << 63;
 }
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 0718c81451..f02344ae64 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1360,7 +1360,7 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
   if ((int64_t)V >= 0)
     Vals.push_back(V << 1);
   else
-    Vals.push_back((-V << 1) | 1);
+    Vals.push_back(((~V + 1) << 1) | 1);
 }
 
 static void WriteConstants(unsigned FirstVal, unsigned LastVal,
@@ -1437,7 +1437,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
       continue;
     }
     const Constant *C = cast<Constant>(V);
-    unsigned Code = -1U;
+    unsigned Code = ~0U;
     unsigned AbbrevToUse = 0;
     if (C->isNullValue()) {
       Code = bitc::CST_CODE_NULL;
diff --git a/lib/DXIL/DxilUtil.cpp b/lib/DXIL/DxilUtil.cpp
index 966c2e189c..cc0b509772 100644
--- a/lib/DXIL/DxilUtil.cpp
+++ b/lib/DXIL/DxilUtil.cpp
@@ -181,11 +181,11 @@ void PrintUnescapedString(StringRef Name, raw_ostream &Out) {
     if (C == '\\') {
       C = Name[++i];
       unsigned value = hexDigitValue(C);
-      if (value != -1U) {
+      if (value != ~0U) {
         C = (unsigned char)value;
         unsigned value2 = hexDigitValue(Name[i + 1]);
-        assert(value2 != -1U && "otherwise, not a two digit hex escape");
-        if (value2 != -1U) {
+        assert(value2 != ~0U && "otherwise, not a two digit hex escape");
+        if (value2 != ~0U) {
           C = (C << 4) + (unsigned char)value2;
           ++i;
         }
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 3c76c72271..f8f1fb03cd 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -331,7 +331,7 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end,
 
   /* If we ran off the end it is exactly zero or one-half, otherwise
      a little more.  */
-  if (hexDigit == -1U)
+  if (hexDigit == ~0U)
     return digitValue == 0 ? lfExactlyZero: lfExactlyHalf;
   else
     return digitValue == 0 ? lfLessThanHalf: lfMoreThanHalf;
@@ -2368,7 +2368,7 @@ APFloat::convertFromHexadecimalString(StringRef s, roundingMode rounding_mode)
     }
 
     hex_value = hexDigitValue(*p);
-    if (hex_value == -1U)
+    if (hex_value == ~0U)
       break;
 
     p++;
diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp
index 5d6d60a87f..625fb3595a 100644
--- a/lib/Support/DataExtractor.cpp
+++ b/lib/Support/DataExtractor.cpp
@@ -168,7 +168,7 @@ int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const {
 
   // Sign bit of byte is 2nd high order bit (0x40)
   if (shift < 64 && (byte & 0x40))
-    result |= -(1ULL << shift);
+    result |= (~(1ULL << shift) + 1);
 
   *offset_ptr = offset;
   return result;
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index d044764025..0cf9f7797a 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -146,7 +146,7 @@ namespace {
   private:
     Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
     Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses,
-                       unsigned RetValNum = -1U);
+                       unsigned RetValNum = ~0U);
     Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
 
     void SurveyFunction(const Function &F);
@@ -442,7 +442,7 @@ DAE::Liveness DAE::SurveyUse(const Use *U,
       // that U is really a use of an insertvalue instruction that uses the
       // original Use.
       const Function *F = RI->getParent()->getParent();
-      if (RetValNum != -1U) {
+      if (RetValNum != ~0U) {
         RetOrArg Use = CreateRet(F, RetValNum);
         // We might be live, depending on the liveness of Use.
         return MarkIfNotLive(Use, MaybeLiveUses);
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 2d28b14213..66e01198bd 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -998,7 +998,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     for (unsigned i = 0; i < VWidth; i++) {
       if (DemandedElts[i]) {
         unsigned MaskVal = Shuffle->getMaskValue(i);
-        if (MaskVal != -1u) {
+        if (MaskVal != ~0u) {
           assert(MaskVal < LHSVWidth * 2 &&
                  "shufflevector mask index out of range!");
           if (MaskVal < LHSVWidth)
@@ -1022,7 +1022,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     bool NewUndefElts = false;
     for (unsigned i = 0; i < VWidth; i++) {
       unsigned MaskVal = Shuffle->getMaskValue(i);
-      if (MaskVal == -1u) {
+      if (MaskVal == ~0u) {
         UndefElts.setBit(i);
       } else if (!DemandedElts[i]) {
         NewUndefElts = true;
diff --git a/tools/clang/lib/Lex/LiteralSupport.cpp b/tools/clang/lib/Lex/LiteralSupport.cpp
index 606c821bb2..62f241812b 100644
--- a/tools/clang/lib/Lex/LiteralSupport.cpp
+++ b/tools/clang/lib/Lex/LiteralSupport.cpp
@@ -141,8 +141,12 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
     // Hex escapes are a maximal series of hex digits.
     bool Overflow = false;
     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
-      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-      if (CharVal == -1) break;
+      // hexDigitValue returns unsigned; ~0U (the same value previously spelled
+      // -1U) marks an invalid hex digit, so keep CharVal unsigned to compare.
+      unsigned int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
+      if (CharVal == ~0U)
+        break;
+
       // About to shift out a digit?
       if (ResultChar & 0xF0000000)
         Overflow = true;
@@ -245,7 +249,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
     uint32_t CodePoint = 0;
     for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
       unsigned Value = llvm::hexDigitValue(*I);
-      assert(Value != -1U);
+      assert(Value != ~0U);
 
       CodePoint <<= 4;
       CodePoint += Value;
@@ -278,8 +282,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
   unsigned short UcnLenSave = UcnLen;
   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
-    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-    if (CharVal == -1) break;
+    unsigned int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
+    if (CharVal == ~0U)
+      break;
     UcnVal <<= 4;
     UcnVal |= CharVal;
   }
diff --git a/tools/clang/lib/Sema/SemaDecl.cpp b/tools/clang/lib/Sema/SemaDecl.cpp
index e09bf4623c..a772054960 100644
--- a/tools/clang/lib/Sema/SemaDecl.cpp
+++ b/tools/clang/lib/Sema/SemaDecl.cpp
@@ -5331,7 +5331,7 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) {
   Qualifiers::ObjCLifetime lifetime = type.getObjCLifetime();
   if (lifetime == Qualifiers::OCL_Autoreleasing) {
     // Various kinds of declaration aren't allowed to be __autoreleasing.
-    unsigned kind = -1U;
+    unsigned kind = ~0U;
     if (VarDecl *var = dyn_cast<VarDecl>(decl)) {
       if (var->hasAttr<BlocksAttr>())
         kind = 0; // __block
@@ -5343,7 +5343,7 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) {
       kind = 2; // field
     }
 
-    if (kind != -1U) {
+    if (kind != ~0U) {
       Diag(decl->getLocation(), diag::err_arc_autoreleasing_var)
         << kind;
     }

From b78ac50593248bbbf04ba51cb48c3e1f94b416c6 Mon Sep 17 00:00:00 2001
From: Tim Corringham <timothy.corringham@amd.com>
Date: Fri, 20 Jun 2025 12:05:05 +0100
Subject: [PATCH 68/93] Add missing diagnostic argument (#7426)

Two instances of the err_integer_literal_too_large diagnostic in HLSL
specific code within Sema::ActOnNumericConstant() had a missing
argument. When these diagnostics were raised this caused an assert in an
assert enabled DXC, and random corruption of the diagnostic text in a
non-assert enabled DXC.

The trivial fix is to supply the required argument.

Fixes #7425

Co-authored-by: Tim Corringham <tcorring@amd.com>
---
 tools/clang/lib/Sema/SemaExpr.cpp                  |  6 ++++--
 .../errors/integer_literal_too_large.hlsl          | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl

diff --git a/tools/clang/lib/Sema/SemaExpr.cpp b/tools/clang/lib/Sema/SemaExpr.cpp
index 389fcfc3ff..b8272ba4a0 100644
--- a/tools/clang/lib/Sema/SemaExpr.cpp
+++ b/tools/clang/lib/Sema/SemaExpr.cpp
@@ -3504,12 +3504,14 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
       Ty = Context.LitIntTy;
       if (Literal.GetIntegerValue(ResultVal)) {
         // If this value didn't fit into 64-bit literal int, report error.
-        Diag(Tok.getLocation(), diag::err_integer_literal_too_large);
+        Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
+            << /* Unsigned */ 1;
       }
     } else {
 
       if (Literal.GetIntegerValue(ResultVal)) {
-        Diag(Tok.getLocation(), diag::err_integer_literal_too_large);
+        Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
+            << /* Unsigned */ 1;
       }
       if (Literal.isLongLong) {
         if (Literal.isUnsigned)
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl
new file mode 100644
index 0000000000..98db6a6f56
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -T lib_6_6 %s | FileCheck %s
+
+// A diagnostic is generated for an integer literal that is too large to be
+// represented by any integer type - an argument indicates whether the text
+// contains "signed". That argument was missing in HLSL-specific code within
+// Sema::ActOnNumericConstant(), which resulted in an assert being raised if
+// the diagnostic was generated in an assert-enabled DXC and a random string
+// being inserted in a non-assert-enabled DXC.
+
+// CHECK: integer literal is too large to be represented in any integer type
+int a = 98765432109876543210;
+
+// CHECK: integer literal is too large to be represented in any integer type
+uint b = 98765432109876543210U;

From b4baabb7da9e483b624e12a86ae29df7d162d4f2 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:21:27 -0700
Subject: [PATCH 69/93] [NFC] Address compiler warnings: Fix C4146 compiler
 warnings in APInt.cpp (#7556)

Addresses #7555

All but one are simple updates to use std::numeric_limits<T>.
One case converts to use ~ operator and includes a comment with
additional context.
---
 lib/Support/APInt.cpp | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 23f89bb66f..d01238a552 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -70,7 +70,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
   if (r < radix)
     return r;
 
-  return -1U;
+  return std::numeric_limits<unsigned>::max();
 }
 
 
@@ -79,7 +79,7 @@ void APInt::initSlowCase(unsigned numBits, uint64_t val, bool isSigned) {
   pVal[0] = val;
   if (isSigned && int64_t(val) < 0)
     for (unsigned i = 1; i < getNumWords(); ++i)
-      pVal[i] = -1ULL;
+      pVal[i] = std::numeric_limits<uint64_t>::max();
 }
 
 void APInt::initSlowCase(const APInt& that) {
@@ -735,7 +735,7 @@ unsigned APInt::countLeadingOnes() const {
   unsigned Count = llvm::countLeadingOnes(pVal[i] << shift);
   if (Count == highWordBits) {
     for (i--; i >= 0; --i) {
-      if (pVal[i] == -1ULL)
+      if (pVal[i] == std::numeric_limits<uint64_t>::max())
         Count += APINT_BITS_PER_WORD;
       else {
         Count += llvm::countLeadingOnes(pVal[i]);
@@ -761,7 +761,8 @@ unsigned APInt::countTrailingZeros() const {
 unsigned APInt::countTrailingOnesSlowCase() const {
   unsigned Count = 0;
   unsigned i = 0;
-  for (; i < getNumWords() && pVal[i] == -1ULL; ++i)
+  for (; i < getNumWords() && pVal[i] == std::numeric_limits<uint64_t>::max();
+       ++i)
     Count += APINT_BITS_PER_WORD;
   if (i < getNumWords())
     Count += llvm::countTrailingOnes(pVal[i]);
@@ -1070,7 +1071,7 @@ APInt APInt::ashr(unsigned shiftAmt) const {
   // issues in the algorithm below.
   if (shiftAmt == BitWidth) {
     if (isNegative())
-      return APInt(BitWidth, -1ULL, true);
+      return APInt(BitWidth, std::numeric_limits<uint64_t>::max(), true);
     else
       return APInt(BitWidth, 0);
   }
@@ -1123,7 +1124,8 @@ APInt APInt::ashr(unsigned shiftAmt) const {
   }
 
   // Remaining words are 0 or -1, just assign them.
-  uint64_t fillValue = (isNegative() ? -1ULL : 0);
+  uint64_t fillValue =
+      (isNegative() ? std::numeric_limits<uint64_t>::max() : 0);
   for (unsigned i = breakWord+1; i < getNumWords(); ++i)
     val[i] = fillValue;
   APInt Result(val, BitWidth);
@@ -2192,7 +2194,18 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
         N = I;
       } else {
         Str.push_back('-');
-        N = -(uint64_t)I;
+        // In this else block, I is strictly negative.
+        //
+        // N must hold the magnitude of I. Negating I as a signed value is
+        // undefined behavior when I == INT64_MIN, and so is (~I + 1): ~I is
+        // INT64_MAX there and the signed addition overflows.
+        //
+        // Converting I to uint64_t first keeps the whole computation in
+        // unsigned arithmetic, which wraps modulo 2^64. ~U + 1 is then the
+        // two's complement negation of U, i.e. the magnitude of I for every
+        // negative I, including INT64_MIN (which yields 2^63). This avoids
+        // applying unary minus to an unsigned operand (MSVC warning C4146).
+        N = ~static_cast<uint64_t>(I) + 1;
       }
     }
 
@@ -2408,7 +2421,7 @@ APInt::tcLSB(const integerPart *parts, unsigned int n)
       }
   }
 
-  return -1U;
+  return std::numeric_limits<unsigned int>::max();
 }
 
 /* Returns the bit number of the most significant set bit of a number.
@@ -2428,7 +2441,7 @@ APInt::tcMSB(const integerPart *parts, unsigned int n)
     }
   } while (n);
 
-  return -1U;
+  return std::numeric_limits<unsigned int>::max();
 }
 
 /* Copy the bit vector of width srcBITS from SRC, starting at bit

From d1d0a31a7a6a039a35d3b8bc9586b23c57bea2a5 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Fri, 20 Jun 2025 17:52:32 -0700
Subject: [PATCH 70/93] [NFC] Address compiler warnings: C4146 - Trivial
 std::numeric_limits<T> cases (#7559)

Addresses #7558. There is also one trivial change to use the ~ operator
included in LEB128.h. My notes on the files were wrong and suggested
that it should use std::numeric_limits<T>, but looking at it again, using
~0ULL made more sense.
---
 .../llvm/DebugInfo/DWARF/DWARFDebugAranges.h  | 11 +++---
 include/llvm/Support/BlockFrequency.h         |  5 ++-
 lib/Analysis/LoopAccessAnalysis.cpp           |  2 +-
 .../InstCombine/InstructionCombining.cpp      |  3 +-
 lib/Transforms/Scalar/LoadCombine.cpp         |  4 +--
 tools/clang/include/clang/AST/Expr.h          |  4 ++-
 tools/clang/lib/AST/Expr.cpp                  | 34 ++++++++-----------
 tools/clang/lib/CodeGen/CGExprScalar.cpp      |  3 +-
 tools/clang/lib/Lex/Lexer.cpp                 |  2 +-
 tools/clang/lib/Sema/SemaExpr.cpp             |  5 +--
 tools/clang/lib/Sema/SemaType.cpp             |  4 +--
 utils/TableGen/FixedLenDecoderEmitter.cpp     |  9 ++---
 12 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
index 791f010a88..c34cfab284 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
@@ -32,12 +32,13 @@ class DWARFDebugAranges {
   void construct();
 
   struct Range {
-    explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL,
-                   uint32_t CUOffset = -1U)
-      : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {}
+    explicit Range(uint64_t LowPC = std::numeric_limits<uint64_t>::max(),
+                   uint64_t HighPC = std::numeric_limits<uint64_t>::max(),
+                   uint32_t CUOffset = std::numeric_limits<uint32_t>::max())
+        : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {}
 
     void setHighPC(uint64_t HighPC) {
-      if (HighPC == -1ULL || HighPC <= LowPC)
+      if (HighPC == std::numeric_limits<uint64_t>::max() || HighPC <= LowPC)
         Length = 0;
       else
         Length = HighPC - LowPC;
@@ -45,7 +46,7 @@ class DWARFDebugAranges {
     uint64_t HighPC() const {
       if (Length)
         return LowPC + Length;
-      return -1ULL;
+      return std::numeric_limits<uint64_t>::max();
     }
 
     bool containsAddress(uint64_t Address) const {
diff --git a/include/llvm/Support/BlockFrequency.h b/include/llvm/Support/BlockFrequency.h
index 4304a253b2..d7d6d741f4 100644
--- a/include/llvm/Support/BlockFrequency.h
+++ b/include/llvm/Support/BlockFrequency.h
@@ -15,6 +15,7 @@
 #define LLVM_SUPPORT_BLOCKFREQUENCY_H
 
 #include "llvm/Support/DataTypes.h"
+#include <limits>
 
 namespace llvm {
 
@@ -29,7 +30,9 @@ class BlockFrequency {
   BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { }
 
   /// \brief Returns the maximum possible frequency, the saturation value.
-  static uint64_t getMaxFrequency() { return -1ULL; }
+  static uint64_t getMaxFrequency() {
+    return std::numeric_limits<uint64_t>::max();
+  }
 
   /// \brief Returns the frequency as a fixpoint number scaled by the entry
   /// frequency.
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index d6316dc75b..7e5e3e5ebd 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1179,7 +1179,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                                    MemAccessInfoSet &CheckDeps,
                                    const ValueToValueMap &Strides) {
 
-  MaxSafeDepDistBytes = -1U;
+  MaxSafeDepDistBytes = std::numeric_limits<unsigned>::max();
   while (!CheckDeps.empty()) {
     MemAccessInfo CurAccess = *CheckDeps.begin();
 
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6bc322fa92..c93232b67f 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1937,7 +1937,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
       } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
         if (II->getIntrinsicID() == Intrinsic::objectsize) {
           ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
-          uint64_t DontKnow = CI->isZero() ? -1ULL : 0;
+          uint64_t DontKnow =
+              CI->isZero() ? std::numeric_limits<uint64_t>::max() : 0;
           ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow));
         }
       }
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 6d358744ef..8f22bb337d 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -131,10 +131,10 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
   LoadInst *BaseLoad = nullptr;
   SmallVector<LoadPOPPair, 8> AggregateLoads;
   bool Combined = false;
-  uint64_t PrevOffset = -1ull;
+  uint64_t PrevOffset = std::numeric_limits<uint64_t>::max();
   uint64_t PrevSize = 0;
   for (auto &L : Loads) {
-    if (PrevOffset == -1ull) {
+    if (PrevOffset == std::numeric_limits<uint64_t>::max()) {
       BaseLoad = L.Load;
       PrevOffset = L.POP.Offset;
       PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
diff --git a/tools/clang/include/clang/AST/Expr.h b/tools/clang/include/clang/AST/Expr.h
index 26eff309f7..55fd184a79 100644
--- a/tools/clang/include/clang/AST/Expr.h
+++ b/tools/clang/include/clang/AST/Expr.h
@@ -4510,7 +4510,9 @@ class GenericSelectionExpr : public Expr {
   Expr *getControllingExpr() { return cast<Expr>(SubExprs[CONTROLLING]); }
 
   /// Whether this generic selection is result-dependent.
-  bool isResultDependent() const { return ResultIndex == -1U; }
+  bool isResultDependent() const {
+    return ResultIndex == std::numeric_limits<unsigned>::max();
+  }
 
   /// The zero-based index of the result expression's generic association in
   /// the generic selection's association list.  Defined only if the
diff --git a/tools/clang/lib/AST/Expr.cpp b/tools/clang/lib/AST/Expr.cpp
index 8ed14508af..2d039a7e98 100644
--- a/tools/clang/lib/AST/Expr.cpp
+++ b/tools/clang/lib/AST/Expr.cpp
@@ -3883,25 +3883,21 @@ GenericSelectionExpr::GenericSelectionExpr(const ASTContext &Context,
   std::copy(AssocExprs.begin(), AssocExprs.end(), SubExprs+END_EXPR);
 }
 
-GenericSelectionExpr::GenericSelectionExpr(const ASTContext &Context,
-                               SourceLocation GenericLoc, Expr *ControllingExpr,
-                               ArrayRef<TypeSourceInfo*> AssocTypes,
-                               ArrayRef<Expr*> AssocExprs,
-                               SourceLocation DefaultLoc,
-                               SourceLocation RParenLoc,
-                               bool ContainsUnexpandedParameterPack)
-  : Expr(GenericSelectionExprClass,
-         Context.DependentTy,
-         VK_RValue,
-         OK_Ordinary,
-         /*isTypeDependent=*/true,
-         /*isValueDependent=*/true,
-         /*isInstantiationDependent=*/true,
-         ContainsUnexpandedParameterPack),
-    AssocTypes(new (Context) TypeSourceInfo*[AssocTypes.size()]),
-    SubExprs(new (Context) Stmt*[END_EXPR+AssocExprs.size()]),
-    NumAssocs(AssocExprs.size()), ResultIndex(-1U), GenericLoc(GenericLoc),
-    DefaultLoc(DefaultLoc), RParenLoc(RParenLoc) {
+GenericSelectionExpr::GenericSelectionExpr(
+    const ASTContext &Context, SourceLocation GenericLoc, Expr *ControllingExpr,
+    ArrayRef<TypeSourceInfo *> AssocTypes, ArrayRef<Expr *> AssocExprs,
+    SourceLocation DefaultLoc, SourceLocation RParenLoc,
+    bool ContainsUnexpandedParameterPack)
+    : Expr(GenericSelectionExprClass, Context.DependentTy, VK_RValue,
+           OK_Ordinary,
+           /*isTypeDependent=*/true,
+           /*isValueDependent=*/true,
+           /*isInstantiationDependent=*/true, ContainsUnexpandedParameterPack),
+      AssocTypes(new(Context) TypeSourceInfo *[AssocTypes.size()]),
+      SubExprs(new(Context) Stmt *[END_EXPR + AssocExprs.size()]),
+      NumAssocs(AssocExprs.size()),
+      ResultIndex(std::numeric_limits<unsigned>::max()), GenericLoc(GenericLoc),
+      DefaultLoc(DefaultLoc), RParenLoc(RParenLoc) {
   SubExprs[CONTROLLING] = ControllingExpr;
   assert(AssocTypes.size() == AssocExprs.size());
   std::copy(AssocTypes.begin(), AssocTypes.end(), this->AssocTypes);
diff --git a/tools/clang/lib/CodeGen/CGExprScalar.cpp b/tools/clang/lib/CodeGen/CGExprScalar.cpp
index 530c791fcc..50aae94505 100644
--- a/tools/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/tools/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2559,7 +2559,8 @@ void ScalarExprEmitter::EmitUndefinedBehaviorIntegerDivAndRemCheck(
 
     llvm::Value *IntMin =
       Builder.getInt(llvm::APInt::getSignedMinValue(Ty->getBitWidth()));
-    llvm::Value *NegOne = llvm::ConstantInt::get(Ty, -1ULL);
+    llvm::Value *NegOne =
+        llvm::ConstantInt::get(Ty, std::numeric_limits<uint64_t>::max());
 
     llvm::Value *LHSCmp = Builder.CreateICmpNE(Ops.LHS, IntMin);
     llvm::Value *RHSCmp = Builder.CreateICmpNE(Ops.RHS, NegOne);
diff --git a/tools/clang/lib/Lex/Lexer.cpp b/tools/clang/lib/Lex/Lexer.cpp
index 089e76b78b..e39573ca34 100644
--- a/tools/clang/lib/Lex/Lexer.cpp
+++ b/tools/clang/lib/Lex/Lexer.cpp
@@ -2737,7 +2737,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
     char C = getCharAndSize(CurPtr, CharSize);
 
     unsigned Value = llvm::hexDigitValue(C);
-    if (Value == -1U) {
+    if (Value == std::numeric_limits<unsigned>::max()) {
       if (Result && !isLexingRawMode()) {
         if (i == 0) {
           Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
diff --git a/tools/clang/lib/Sema/SemaExpr.cpp b/tools/clang/lib/Sema/SemaExpr.cpp
index b8272ba4a0..cbc4ac37ab 100644
--- a/tools/clang/lib/Sema/SemaExpr.cpp
+++ b/tools/clang/lib/Sema/SemaExpr.cpp
@@ -1466,7 +1466,7 @@ Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc,
         ContainsUnexpandedParameterPack);
 
   SmallVector<unsigned, 1> CompatIndices;
-  unsigned DefaultIndex = -1U;
+  unsigned DefaultIndex = std::numeric_limits<unsigned>::max();
   for (unsigned i = 0; i < NumAssocs; ++i) {
     if (!Types[i])
       DefaultIndex = i;
@@ -1498,7 +1498,8 @@ Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc,
   // C11 6.5.1.1p2 "If a generic selection has no default generic association,
   // its controlling expression shall have type compatible with exactly one of
   // the types named in its generic association list."
-  if (DefaultIndex == -1U && CompatIndices.size() == 0) {
+  if (DefaultIndex == std::numeric_limits<unsigned>::max() &&
+      CompatIndices.size() == 0) {
     // We strip parens here because the controlling expression is typically
     // parenthesized in macro definitions.
     ControllingExpr = ControllingExpr->IgnoreParens();
diff --git a/tools/clang/lib/Sema/SemaType.cpp b/tools/clang/lib/Sema/SemaType.cpp
index ff3b0dbac7..f08ae486b5 100644
--- a/tools/clang/lib/Sema/SemaType.cpp
+++ b/tools/clang/lib/Sema/SemaType.cpp
@@ -462,7 +462,7 @@ distributeObjCPointerTypeAttrFromDeclarator(TypeProcessingState &state,
 
   // objc_gc goes on the innermost pointer to something that's not a
   // pointer.
-  unsigned innermost = -1U;
+  unsigned innermost = std::numeric_limits<unsigned>::max();
   bool considerDeclSpec = true;
   for (unsigned i = 0, e = declarator.getNumTypeObjects(); i != e; ++i) {
     DeclaratorChunk &chunk = declarator.getTypeObject(i);
@@ -501,7 +501,7 @@ distributeObjCPointerTypeAttrFromDeclarator(TypeProcessingState &state,
 
   // Otherwise, if we found an appropriate chunk, splice the attribute
   // into it.
-  if (innermost != -1U) {
+  if (innermost != std::numeric_limits<unsigned>::max()) {
     moveAttrFromListToList(attr, declarator.getAttrListRef(),
                        declarator.getTypeObject(innermost).getAttrListRef());
     return;
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index c5ef9d0e99..d356971f24 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -547,10 +547,11 @@ void Filter::recurse() {
 
     // Delegates to an inferior filter chooser for further processing on this
     // group of instructions whose segment values are variable.
-    FilterChooserMap.insert(
-        std::make_pair(-1U, llvm::make_unique<FilterChooser>(
-                                Owner->AllInstructions, VariableInstructions,
-                                Owner->Operands, BitValueArray, *Owner)));
+    FilterChooserMap.insert(std::make_pair(
+        std::numeric_limits<unsigned>::max(),
+        llvm::make_unique<FilterChooser>(Owner->AllInstructions,
+                                         VariableInstructions, Owner->Operands,
+                                         BitValueArray, *Owner)));
   }
 
   // No need to recurse for a singleton filtered instruction.

From dd725c203c3acffcf0c43f496f3c0676bdae1f80 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Tue, 24 Jun 2025 06:42:33 -0400
Subject: [PATCH 71/93] Revert "[SPIRV] Use unknown image format in vk1.3 and
 later" (#7572)

I just learned more about the VK feature. In VK1.3, the validation rule
was moved from the existence of the capability to being specific to the
format. It is possible that people will see regressions if their code
runs on a driver that does not support
VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT or
VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT for the format used
by the developer.

Reverts microsoft/DirectXShaderCompiler#7528
---
 .../clang/include/clang/SPIRV/SpirvBuilder.h  |   2 -
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp    |   7 -
 .../CodeGenSPIRV/node.empty-node-input.hlsl   |   2 +-
 .../clang/test/CodeGenSPIRV/type.buffer.hlsl  | 176 +++++++-----------
 .../type.rasterizer-ordered-buffer.hlsl       |  92 ++++-----
 .../type.rasterizer-ordered-texture.hlsl      |  36 ++--
 .../test/CodeGenSPIRV/type.rwtexture.hlsl     |  56 ++----
 7 files changed, 137 insertions(+), 234 deletions(-)

diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
index 4fe31c6d62..465f7313f1 100644
--- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h
+++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h
@@ -812,8 +812,6 @@ class SpirvBuilder {
   /// the given target at the given source location.
   inline void requireExtension(llvm::StringRef extension, SourceLocation);
 
-  FeatureManager &getFeatureManager() { return featureManager; }
-
 private:
   /// \brief If not added already, adds an OpExtInstImport (import of extended
   /// instruction set) for the given instruction set. Returns the imported
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 0309d56840..1869983ae3 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -1156,13 +1156,6 @@ LowerTypeVisitor::lowerStructFields(const RecordDecl *decl,
 spv::ImageFormat
 LowerTypeVisitor::translateSampledTypeToImageFormat(QualType sampledType,
                                                     SourceLocation srcLoc) {
-
-  // In Vulkan 1.3, all image types can be Unknown.
-  FeatureManager &featureManager = spvBuilder.getFeatureManager();
-  if (!featureManager.isTargetEnvVulkan() ||
-      featureManager.isTargetEnvVulkan1p3OrAbove())
-    return spv::ImageFormat::Unknown;
-
   uint32_t elemCount = 1;
   QualType ty = {};
   if (!isScalarType(sampledType, &ty) &&
diff --git a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
index da6a1d32df..fa16429a1b 100644
--- a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl
@@ -19,7 +19,7 @@ void emptynodeinput(EmptyNodeInput input)
 
 // CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0
 // CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0
-// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 Unknown
+// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 R32ui
 // CHECK-DAG: [[IMGPTR:%[^ ]*]] = OpTypePointer UniformConstant [[IMG]]
 // CHECK-DAG: [[BUF:%[^ ]*]] = OpVariable [[IMGPTR]] UniformConstant
 
diff --git a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
index 3e7bb73bcb..35d1b868a8 100644
--- a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
@@ -1,149 +1,109 @@
-// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
-// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
-// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
-
-// Before vulkan1.3, we should be trying to infer the image type for because
-// we cannot necessarily use Unknown. However in VK1.3 and later, we can use
-// Unknown.
+// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s
 
 // CHECK: OpCapability SampledBuffer
-// INFER: OpCapability StorageImageExtendedFormats
+// CHECK: OpCapability StorageImageExtendedFormats
 
-// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
-// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 Unknown
+// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 Buffer<int> intbuf;
-// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
-// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
+// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 Buffer<uint> uintbuf;
-// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
-// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 Unknown
+// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 Buffer<float> floatbuf;
 
-// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
-// UNKNOWN: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
 // CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RWBuffer<int> intrwbuf;
-// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
-// UNKNOWN: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
 // CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RWBuffer<uint> uintrwbuf;
-// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
-// UNKNOWN: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
 // CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RWBuffer<float> floatrwbuf;
 
-// If the `Unkonwn image format is used, then the images below will reuse the types above.
-// UNKNOWN-NOT: OpTypeImage
-
-// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
 Buffer<int2> int2buf;
-// INFER: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// CHECK: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 Buffer<uint2> uint2buf;
-// INFER: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// CHECK: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
 Buffer<float2> float2buf;
 
-// INFER: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// CHECK: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RWBuffer<int2> int2rwbuf;
-// INFER: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// CHECK: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
 RWBuffer<uint2> uint2rwbuf;
-// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RWBuffer<float2> float2rwbuf;
 
-// INFER: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
-// INFER: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
+// CHECK: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
+// CHECK: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
 Buffer<int3> int3buf;
 Buffer<int4> int4buf;
-// INFER: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
-// INFER: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
+// CHECK: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
+// CHECK: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
 Buffer<uint3> uint3buf;
 Buffer<uint4> uint4buf;
-// INFER: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
-// INFER: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
+// CHECK: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
+// CHECK: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
 Buffer<float3> float3buf;
 Buffer<float4> float4buf;
 
-// INFER: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
-// INFER: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
+// CHECK: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
+// CHECK: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
 RWBuffer<int3> int3rwbuf;
 RWBuffer<int4> int4rwbuf;
-// INFER: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
-// INFER: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
+// CHECK: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
+// CHECK: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
 RWBuffer<uint3> uint3rwbuf;
 RWBuffer<uint4> uint4rwbuf;
-// INFER: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
-// INFER: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
+// CHECK: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
+// CHECK: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
 RWBuffer<float3> float3rwbuf;
 RWBuffer<float4> float4rwbuf;
 
-// INFER: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// INFER: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// INFER: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// INFER: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// INFER: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// INFER: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// INFER: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// INFER: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// INFER: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// INFER: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// INFER: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// INFER: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
-// INFER: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
-// INFER: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
-// INFER: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
-// INFER: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
-// INFER: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
-// INFER: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
-// INFER: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
-// INFER: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
-// INFER: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
-// INFER: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
-// INFER: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
-// INFER: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
-
-// UNKNOWN: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// UNKNOWN: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// UNKNOWN: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// UNKNOWN: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// UNKNOWN: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// UNKNOWN: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// UNKNOWN: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// UNKNOWN: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// UNKNOWN: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// UNKNOWN: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// UNKNOWN: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// UNKNOWN: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// CHECK: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// CHECK: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// CHECK: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// CHECK: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// CHECK: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// CHECK: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// CHECK: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// CHECK: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// CHECK: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// CHECK: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// CHECK: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// CHECK: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+// CHECK: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
+// CHECK: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
+// CHECK: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
+// CHECK: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
+// CHECK: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
+// CHECK: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
+// CHECK: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
+// CHECK: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
+// CHECK: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
+// CHECK: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
+// CHECK: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
+// CHECK: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
 
 void main() {}
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
index 0b576fc5e9..c616f65bb9 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
@@ -1,81 +1,59 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
-// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
-// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
-
-// Before vulkan1.3, we should be trying to infer the image type for because
-// we cannot necessarily use Unknown. However in VK1.3 and later, we can use
-// Unknown.
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
 
 // CHECK: OpCapability SampledBuffer
-// INFER: OpCapability StorageImageExtendedFormats
+// CHECK: OpCapability StorageImageExtendedFormats
 
-// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
-// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 RasterizerOrderedBuffer<int> introvbuf;
-// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
-// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 RasterizerOrderedBuffer<uint> uintrovbuf;
-// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
-// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 RasterizerOrderedBuffer<float> floatrovbuf;
 
-// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
+// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RasterizerOrderedBuffer<int2> int2rovbuf;
-// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
+// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RasterizerOrderedBuffer<uint2> uint2rovbuf;
-// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
+// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RasterizerOrderedBuffer<float2> float2rovbuf;
 
-// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
-// INFER: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// CHECK: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 RasterizerOrderedBuffer<int3> int3rovbuf;
 RasterizerOrderedBuffer<int4> int4rovbuf;
-// INFER: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
-// INFER: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// CHECK: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// CHECK: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RasterizerOrderedBuffer<uint3> uint3rovbuf;
 RasterizerOrderedBuffer<uint4> uint4rovbuf;
-// INFER: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
-// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// CHECK: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RasterizerOrderedBuffer<float3> float3rovbuf;
 RasterizerOrderedBuffer<float4> float4rovbuf;
 
-// INFER: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// INFER: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// INFER: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// INFER: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// INFER: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// INFER: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// INFER: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// INFER: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// INFER: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// INFER: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// INFER: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// INFER: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
-
-// UNKNOWN: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// UNKNOWN: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// UNKNOWN: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// UNKNOWN: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// CHECK: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// CHECK: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// CHECK: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// CHECK: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// CHECK: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// CHECK: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// CHECK: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// CHECK: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// CHECK: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// CHECK: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// CHECK: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// CHECK: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
 
 void main() {}
 
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
index 21bff421a0..32dd76e6f1 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
@@ -1,28 +1,23 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
-// RUN: %dxc -fspv-target-env=vulkan1.3 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
-// RUN: %dxc -fspv-target-env=universal1.5 -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
 
 // CHECK: OpCapability Image1D
 
-// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
-// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
+// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
 // CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
-// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
+// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
 // CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
-// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
+// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
 // CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
@@ -38,8 +33,7 @@ RasterizerOrderedTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RasterizerOrderedTexture3D   <float3> t4 ;
 
-// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
-// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
+// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
 RasterizerOrderedTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant
diff --git a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
index 884957210a..f901d44cfa 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
@@ -1,43 +1,24 @@
-// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
-// RUN: %dxc -fspv-target-env=vulkan1.3 -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s
 
 // CHECK: OpCapability Image1D
 
-// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
-// INFER: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
-// INFER: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
-// INFER: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
+// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
+// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
-// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// UNKNOWN: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// UNKNOWN: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// UNKNOWN: %type_3d_image_1 = OpTypeImage %float 3D 2 0 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_3d_image_1 = OpTypePointer UniformConstant %type_3d_image_1
-// UNKNOWN: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// UNKNOWN: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// UNKNOWN: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// UNKNOWN: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Unknown
-// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
 RWTexture1D   <int>    t1 ;
@@ -52,8 +33,7 @@ RWTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RWTexture3D   <float3> t4 ;
 
-// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
-// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
+// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
 RWTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant

From 8f5595872e158796195d1b5526761b5a4216bf40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Tue, 24 Jun 2025 18:40:03 +0200
Subject: [PATCH 72/93] [SPIR-V] Prepare SDK v2025.3 release (#7579)

Updating SPIRV-{Headers,Tools} for the release.
---
 external/SPIRV-Headers | 2 +-
 external/SPIRV-Tools   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index c9aad99f92..2a611a970f 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit c9aad99f9276817f18f72a4696239237c83cb775
+Subproject commit 2a611a970fdbc41ac2e3e328802aed9985352dca
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index da48bb20bd..33e0256818 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit da48bb20bdfc8a214d5bffdacca2d1d2ae849009
+Subproject commit 33e02568181e3312f49a3cf33df470bf96ef293a

From 3e01e8b70ee18bd33e706a3c6779ec397d7e9a1f Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 24 Jun 2025 17:41:37 -0700
Subject: [PATCH 73/93] [NFC] Address compiler warnings: C4146 - Another round
 of use two's complement instead of negation (#7567)

Addresses https://github.com/microsoft/DirectXShaderCompiler/issues/7565

A few more instances where we can take advantage of -N being equivalent
to (~N + 1)
---
 lib/Analysis/BasicAliasAnalysis.cpp  | 6 +++++-
 lib/Analysis/ConstantFolding.cpp     | 4 ++--
 lib/Analysis/InstructionSimplify.cpp | 8 ++++----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index aa0f9ed873..956c334374 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1117,7 +1117,11 @@ AliasResult BasicAliasAnalysis::aliasGEP(
       // stripped a gep with negative index ('gep <ptr>, -1, ...).
       if (V1Size != MemoryLocation::UnknownSize &&
           V2Size != MemoryLocation::UnknownSize) {
-        if (-(uint64_t)GEP1BaseOffset < V1Size)
+        // GEP1BaseOffset is negative in this else block and because we're
+        // assigning to an unsigned variable, we can make use of
+        // -I == (~I + 1) to compute the absolute value of GEP1BaseOffset.
+        const uint64_t GEP1BaseOffsetAbs = (~GEP1BaseOffset + 1ULL);
+        if (GEP1BaseOffsetAbs < V1Size)
           return PartialAlias;
         return NoAlias;
       }
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 69c9b10b60..0167bdf0a1 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -187,7 +187,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
         // Shift it to the right place, depending on endianness.
         Src = ConstantExpr::getShl(Src,
                                    ConstantInt::get(Src->getType(), ShiftAmt));
-        ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
+        ShiftAmt += isLittleEndian ? SrcBitSize : (~SrcBitSize + 1U);
 
         // Mix it in.
         Elt = ConstantExpr::getOr(Elt, Src);
@@ -213,7 +213,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
       // endianness.
       Constant *Elt = ConstantExpr::getLShr(Src,
                                   ConstantInt::get(Src->getType(), ShiftAmt));
-      ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
+      ShiftAmt += isLittleEndian ? DstBitSize : (~DstBitSize + 1U);
 
       // Truncate the element to an integer with the same pointer size and
       // convert the element back to a pointer using a inttoptr.
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 89c7cc7a3e..96c0b3302d 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -4109,7 +4109,7 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
         // Shift it to the right place, depending on endianness.
         Src = ConstantExpr::getShl(Src,
                                    ConstantInt::get(Src->getType(), ShiftAmt));
-        ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
+        ShiftAmt += isLittleEndian ? SrcBitSize : (~SrcBitSize + 1U);
 
         // Mix it in.
         Elt = ConstantExpr::getOr(Elt, Src);
@@ -4144,9 +4144,9 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
     for (unsigned j = 0; j != Ratio; ++j) {
       // Shift the piece of the value into the right place, depending on
       // endianness.
-      Constant *Elt = ConstantExpr::getLShr(Src,
-                                  ConstantInt::get(Src->getType(), ShiftAmt));
-      ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
+      Constant *Elt = ConstantExpr::getLShr(
+          Src, ConstantInt::get(Src->getType(), ShiftAmt));
+      ShiftAmt += isLittleEndian ? DstBitSize : (~DstBitSize + 1U);
 
       // Truncate the element to an integer with the same pointer size and
       // convert the element back to a pointer using a inttoptr.

From 23118b9eaab90d7cb6b95a95cc8ea3f313b4b05a Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 24 Jun 2025 17:51:39 -0700
Subject: [PATCH 74/93] [NFC] Address compiler warnings: C4146 - A 'grab bag'
 of remaining instances (#7574)

## Fix C4146 warnings: unary minus on unsigned types

Fixes several remaining MSVC C4146 warnings where unary minus was
applied to unsigned integers.
This should be the last PR containing MSVC C4146 warning fixes. I will
remove the disablement of the warning as an error in a subsequent PR
once the other pending PRs are completed.

**Changes:**
- Replace `-(unsigned_value)` with `~unsigned_value + 1` for offset
calculations
- Use `-1LL` instead of `-1ULL` where signed values are intended
- Fix alignment padding calculation to avoid unsigned negation

**Files changed:**
- CoverageMappingGen.cpp, Lexer.cpp, Rewriter.cpp: Use two's complement
for safe unsigned negation in offset calculations
- ItaniumCXXABI.cpp: Use signed literal for ABI-compliant null member
pointer (-1)
- ExprConstant.cpp: Replace `-1ULL` with `~0ULL` for bitmasks
- CodeGenMapTable.cpp: Fix sentinel value generation

All changes are mathematically equivalent and preserve existing behavior
while eliminating compiler warnings.

Addresses #7573
---
 lib/Support/APFloat.cpp                       |  2 +-
 lib/Support/StringRef.cpp                     | 14 ++++++++-----
 lib/Support/TimeValue.cpp                     |  3 +--
 lib/Transforms/Scalar/LoopStrengthReduce.cpp  | 21 +++++++++++--------
 tools/clang/lib/AST/ExprConstant.cpp          |  4 ++--
 tools/clang/lib/AST/MicrosoftMangle.cpp       |  6 +++---
 .../clang/lib/CodeGen/CoverageMappingGen.cpp  |  2 +-
 tools/clang/lib/CodeGen/ItaniumCXXABI.cpp     | 14 ++++++-------
 tools/clang/lib/CodeGen/TargetInfo.cpp        |  4 ++--
 tools/clang/lib/Format/Format.cpp             |  2 +-
 tools/clang/lib/Lex/Lexer.cpp                 |  2 +-
 tools/clang/lib/Rewrite/Rewriter.cpp          |  4 ++--
 12 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index f8f1fb03cd..40c22459e2 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -446,7 +446,7 @@ ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest)
       if (~parts[count])
         return ~(integerPart) 0; /* A lot.  */
 
-    return -parts[0];
+    return (~parts[0] + 1);
   }
 
   return ~(integerPart) 0; /* A lot.  */
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index ddece087a9..52b949d826 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/edit_distance.h"
 #include <bitset>
+#include <limits>
 
 using namespace llvm;
 
@@ -393,13 +394,16 @@ bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
 
   // Get the positive part of the value.
   if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
-      // Reject values so large they'd overflow as negative signed, but allow
-      // "-0".  This negates the unsigned so that the negative isn't undefined
-      // on signed overflow.
-      (long long)-ULLVal > 0)
+      // Reject values larger than what can be represented as negative signed.
+      // The most negative long long is LLONG_MIN, which has magnitude
+      // (LLONG_MAX + 1). Values larger than this magnitude cannot be negated
+      // without overflow.
+      ULLVal > static_cast<unsigned long long>(
+                   std::numeric_limits<long long>::max()) +
+                   1)
     return true;
 
-  Result = -ULLVal;
+  Result = (~ULLVal + 1);
   return false;
 }
 
diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp
index 136b93ecee..06de27bbda 100644
--- a/lib/Support/TimeValue.cpp
+++ b/lib/Support/TimeValue.cpp
@@ -19,8 +19,7 @@ using namespace sys;
 
 const TimeValue::SecondsType
   TimeValue::PosixZeroTimeSeconds = -946684800;
-const TimeValue::SecondsType
-  TimeValue::Win32ZeroTimeSeconds = -12591158400ULL;
+const TimeValue::SecondsType TimeValue::Win32ZeroTimeSeconds = -12591158400LL;
 
 void
 TimeValue::normalize( void ) {
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 3ab9367a6b..60962ec69a 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1395,8 +1395,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
       // Offs is the ICmp immediate.
       if (Scale == 0)
-        // The cast does the right thing with INT64_MIN.
-        BaseOffset = -(uint64_t)BaseOffset;
+        // Negate BaseOffset using two's complement (~x + 1) to avoid undefined
+        // behavior. Simple negation (-BaseOffset) would be undefined for
+        // INT64_MIN since -INT64_MIN cannot fit in int64_t. Two's complement
+        // gives the expected wraparound behavior: -INT64_MIN becomes INT64_MIN.
+        BaseOffset = ~BaseOffset + 1ULL;
       return TTI.isLegalICmpImmediate(BaseOffset);
     }
 
@@ -3000,7 +3003,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
         // of -1) are now also interesting.
         for (size_t i = 0, e = Factors.size(); i != e; ++i)
           if (Factors[i] != -1)
-            Factors.insert(-(uint64_t)Factors[i]);
+            Factors.insert(~Factors[i] + 1ULL);
         Factors.insert(-1);
       }
 
@@ -3739,7 +3742,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
     const SCEV *OrigReg = WI.OrigReg;
 
     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
-    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, ~Imm + 1ULL));
     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
 
     // TODO: Use a more targeted data structure.
@@ -3754,8 +3757,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       if (F.ScaledReg == OrigReg) {
         int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
         // Don't create 50 + reg(-50).
-        if (F.referencesReg(SE.getSCEV(
-                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
+        if (F.referencesReg(
+                SE.getSCEV(ConstantInt::get(IntTy, ~Offset + 1ULL))))
           continue;
         Formula NewF = F;
         NewF.BaseOffset = Offset;
@@ -4556,7 +4559,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F,
       // The other interesting way of "folding" with an ICmpZero is to use a
       // negated immediate.
       if (!ICmpScaledV)
-        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+        ICmpScaledV = ConstantInt::get(IntTy, ~Offset + 1ULL);
       else {
         Ops.push_back(SE.getUnknown(ICmpScaledV));
         ICmpScaledV = ConstantInt::get(IntTy, Offset);
@@ -4608,8 +4611,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F,
       assert((F.Scale == 0 || F.Scale == 1) &&
              "ICmp does not support folding a global value and "
              "a scale at the same time!");
-      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
-                                           -(uint64_t)Offset);
+      Constant *C =
+          ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), ~Offset + 1ULL);
       if (C->getType() != OpTy)
         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
                                                           OpTy, false),
diff --git a/tools/clang/lib/AST/ExprConstant.cpp b/tools/clang/lib/AST/ExprConstant.cpp
index baa0349cfe..c24e44022f 100644
--- a/tools/clang/lib/AST/ExprConstant.cpp
+++ b/tools/clang/lib/AST/ExprConstant.cpp
@@ -6555,7 +6555,7 @@ bool IntExprEvaluator::VisitCallExpr(const CallExpr *E) {
     // handle all cases where the expression has side-effects.
     if (E->getArg(0)->HasSideEffects(Info.Ctx)) {
       if (E->getArg(1)->EvaluateKnownConstInt(Info.Ctx).getZExtValue() <= 1)
-        return Success(-1ULL, E);
+        return Success(~0ULL, E);
       return Success(0, E);
     }
 
@@ -6570,7 +6570,7 @@ bool IntExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return Error(E);
     case EvalInfo::EM_ConstantExpressionUnevaluated:
     case EvalInfo::EM_PotentialConstantExpressionUnevaluated:
-      return Success(-1ULL, E);
+      return Success(~0ULL, E);
     }
     llvm_unreachable("Invalid EvalMode!");
   }
diff --git a/tools/clang/lib/AST/MicrosoftMangle.cpp b/tools/clang/lib/AST/MicrosoftMangle.cpp
index 40dca1bb1b..ae9f1cd7f8 100644
--- a/tools/clang/lib/AST/MicrosoftMangle.cpp
+++ b/tools/clang/lib/AST/MicrosoftMangle.cpp
@@ -633,7 +633,7 @@ void MicrosoftCXXNameMangler::mangleNumber(int64_t Number) {
 
   uint64_t Value = static_cast<uint64_t>(Number);
   if (Number < 0) {
-    Value = -Value;
+    Value = ~Value + 1ULL;
     Out << '?';
   }
 
@@ -2308,7 +2308,7 @@ static void mangleThunkThisAdjustment(const CXXMethodDecl *MD,
       Out << AccessSpec;
       Mangler.mangleNumber(
           static_cast<uint32_t>(Adjustment.Virtual.Microsoft.VtordispOffset));
-      Mangler.mangleNumber(-static_cast<uint32_t>(Adjustment.NonVirtual));
+      Mangler.mangleNumber(~static_cast<uint32_t>(Adjustment.NonVirtual) + 1);
     }
   } else if (Adjustment.NonVirtual != 0) {
     switch (MD->getAccess()) {
@@ -2323,7 +2323,7 @@ static void mangleThunkThisAdjustment(const CXXMethodDecl *MD,
     case AS_public:
       Out << 'W';
     }
-    Mangler.mangleNumber(-static_cast<uint32_t>(Adjustment.NonVirtual));
+    Mangler.mangleNumber(~static_cast<uint32_t>(Adjustment.NonVirtual) + 1);
   } else {
     switch (MD->getAccess()) {
     case AS_none:
diff --git a/tools/clang/lib/CodeGen/CoverageMappingGen.cpp b/tools/clang/lib/CodeGen/CoverageMappingGen.cpp
index eca91590e6..e16e015a74 100644
--- a/tools/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/tools/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -116,7 +116,7 @@ class CoverageMappingBuilder {
   /// \brief Return the start location of an included file or expanded macro.
   SourceLocation getStartOfFileOrMacro(SourceLocation Loc) {
     if (Loc.isMacroID())
-      return Loc.getLocWithOffset(-SM.getFileOffset(Loc));
+      return Loc.getLocWithOffset(~SM.getFileOffset(Loc) + 1);
     return SM.getLocForStartOfFile(SM.getFileID(Loc));
   }
 
diff --git a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 97fe28be7f..698d34c774 100644
--- a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -639,8 +639,8 @@ llvm::Constant *
 ItaniumCXXABI::EmitNullMemberPointer(const MemberPointerType *MPT) {
   // Itanium C++ ABI 2.3:
   //   A NULL pointer is represented as -1.
-  if (MPT->isMemberDataPointer()) 
-    return llvm::ConstantInt::get(CGM.PtrDiffTy, -1ULL, /*isSigned=*/true);
+  if (MPT->isMemberDataPointer())
+    return llvm::ConstantInt::get(CGM.PtrDiffTy, -1LL, /*isSigned=*/true);
 
   llvm::Constant *Zero = llvm::ConstantInt::get(CGM.PtrDiffTy, 0);
   llvm::Constant *Values[2] = { Zero, Zero };
@@ -1023,7 +1023,7 @@ static CharUnits computeOffsetHint(ASTContext &Context,
   // If Dst is not derived from Src we can skip the whole computation below and
   // return that Src is not a public base of Dst.  Record all inheritance paths.
   if (!Dst->isDerivedFrom(Src, Paths))
-    return CharUnits::fromQuantity(-2ULL);
+    return CharUnits::fromQuantity(-2LL);
 
   unsigned NumPublicPaths = 0;
   CharUnits Offset;
@@ -1040,7 +1040,7 @@ static CharUnits computeOffsetHint(ASTContext &Context,
       // If the path contains a virtual base class we can't give any hint.
       // -1: no hint.
       if (J->Base->isVirtual())
-        return CharUnits::fromQuantity(-1ULL);
+        return CharUnits::fromQuantity(-1LL);
 
       if (NumPublicPaths > 1) // Won't use offsets, skip computation.
         continue;
@@ -1053,11 +1053,11 @@ static CharUnits computeOffsetHint(ASTContext &Context,
 
   // -2: Src is not a public base of Dst.
   if (NumPublicPaths == 0)
-    return CharUnits::fromQuantity(-2ULL);
+    return CharUnits::fromQuantity(-2LL);
 
   // -3: Src is a multiple public base type but never a virtual base type.
   if (NumPublicPaths > 1)
-    return CharUnits::fromQuantity(-3ULL);
+    return CharUnits::fromQuantity(-3LL);
 
   // Otherwise, the Src type is a unique public nonvirtual base type of Dst.
   // Return the offset of Src from the origin of Dst.
@@ -1154,7 +1154,7 @@ llvm::Value *ItaniumCXXABI::EmitDynamicCastToVoid(CodeGenFunction &CGF,
 
   // Get the offset-to-top from the vtable.
   llvm::Value *OffsetToTop =
-      CGF.Builder.CreateConstInBoundsGEP1_64(VTable, -2ULL);
+      CGF.Builder.CreateConstInBoundsGEP1_64(VTable, -2LL);
   OffsetToTop = CGF.Builder.CreateLoad(OffsetToTop, "offset.to.top");
 
   // Finally, add the offset to the pointer.
diff --git a/tools/clang/lib/CodeGen/TargetInfo.cpp b/tools/clang/lib/CodeGen/TargetInfo.cpp
index aba43964d9..aaf63355af 100644
--- a/tools/clang/lib/CodeGen/TargetInfo.cpp
+++ b/tools/clang/lib/CodeGen/TargetInfo.cpp
@@ -1283,7 +1283,7 @@ llvm::Value *X86_32ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
     Addr = CGF.Builder.CreateGEP(Addr, Offset);
     llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(Addr,
                                                     CGF.Int32Ty);
-    llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int32Ty, -Align);
+    llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int32Ty, ~Align + 1);
     Addr = CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask),
                                       Addr->getType(),
                                       "ap.cur.aligned");
@@ -2849,7 +2849,7 @@ static llvm::Value *EmitVAArgFromMemory(llvm::Value *VAListAddr,
     overflow_arg_area = CGF.Builder.CreateGEP(overflow_arg_area, Offset);
     llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(overflow_arg_area,
                                                     CGF.Int64Ty);
-    llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, -(uint64_t)Align);
+    llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, ~Align + 1);
     overflow_arg_area =
       CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask),
                                  overflow_arg_area->getType(),
diff --git a/tools/clang/lib/Format/Format.cpp b/tools/clang/lib/Format/Format.cpp
index 7d556c9f0f..b6ca328972 100644
--- a/tools/clang/lib/Format/Format.cpp
+++ b/tools/clang/lib/Format/Format.cpp
@@ -1049,7 +1049,7 @@ class FormatTokenLexer {
     FormatTok = new (Allocator.Allocate()) FormatToken;
     readRawToken(*FormatTok);
     SourceLocation WhitespaceStart =
-        FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
+        FormatTok->Tok.getLocation().getLocWithOffset(~TrailingWhitespace + 1);
     FormatTok->IsFirst = IsFirstToken;
     IsFirstToken = false;
 
diff --git a/tools/clang/lib/Lex/Lexer.cpp b/tools/clang/lib/Lex/Lexer.cpp
index e39573ca34..ce9dd8a3c0 100644
--- a/tools/clang/lib/Lex/Lexer.cpp
+++ b/tools/clang/lib/Lex/Lexer.cpp
@@ -480,7 +480,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
   }
   
   // Create a lexer starting at the beginning of this token.
-  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
+  SourceLocation LexerStartLoc = Loc.getLocWithOffset(~LocInfo.second + 1);
   Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
   TheLexer.SetCommentRetentionState(true);
   
diff --git a/tools/clang/lib/Rewrite/Rewriter.cpp b/tools/clang/lib/Rewrite/Rewriter.cpp
index be09a363a6..fa081d65ac 100644
--- a/tools/clang/lib/Rewrite/Rewriter.cpp
+++ b/tools/clang/lib/Rewrite/Rewriter.cpp
@@ -60,7 +60,7 @@ void RewriteBuffer::RemoveText(unsigned OrigOffset, unsigned Size,
   Buffer.erase(RealOffset, Size);
 
   // Add a delta so that future changes are offset correctly.
-  AddReplaceDelta(OrigOffset, -Size);
+  AddReplaceDelta(OrigOffset, ~Size + 1);
 
   if (removeLineIfEmpty) {
     // Find the line that the remove occurred and if it is completely empty
@@ -86,7 +86,7 @@ void RewriteBuffer::RemoveText(unsigned OrigOffset, unsigned Size,
     }
     if (posI != end() && *posI == '\n') {
       Buffer.erase(curLineStartOffs, lineSize + 1/* + '\n'*/);
-      AddReplaceDelta(curLineStartOffs, -(lineSize + 1/* + '\n'*/));
+      AddReplaceDelta(curLineStartOffs, ~(lineSize + 1 /* + '\n'*/) + 1);
     }
   }
 }

From b390fb19adc5d7c23180eb470470411fce986910 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 24 Jun 2025 20:42:26 -0700
Subject: [PATCH 75/93] [NFC] Address compiler warnings: C4146 - Use two's
 complement instead of negation  (#7562)

Replaces uses of the unary - operator on signed integers with the
equivalent (sort of, see the details below) expression '~N + 1',
assigning the result to an unsigned type. This avoids undefined behavior
in edge cases and ensures correctness when certain conditions are met.

Details:
This transformation is valid when:

- The signed value N is guaranteed to be negative.
- The result is stored in an unsigned type that can represent the full
  range of the signed type (e.g., uint64_t for int64_t).
- The system uses two's complement representation (as is standard on
  modern platforms).

While -N is undefined for the minimum representable value (e.g.,
INT64_MIN), the expression ~N + 1 remains well-defined and yields the
correct bit pattern. Assigning this result to an appropriately sized
unsigned type preserves the intended two's complement interpretation
without triggering undefined behavior.

Addresses #7561.
---
 lib/Support/raw_ostream.cpp | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index b11ffb15d5..595468a6dc 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -134,13 +134,18 @@ raw_ostream &raw_ostream::operator<<(unsigned long N) {
 }
 
 raw_ostream &raw_ostream::operator<<(long N) {
+  // A positive signed long has the same value when casted to its unsigned
+  // counterpart. If its negative, then we'll handle it in the below if block.
+  unsigned long UN = static_cast<unsigned long>(N);
+
   if (N < 0 && writeBase == 10) {
     *this << '-';
-    // Avoid undefined behavior on LONG_MIN with a cast.
-    N = -(unsigned long)N;
+    // Since N is negative and we're storing the result in an unsigned Long,
+    // we can use the equivalence of -N == ~N + 1 to get the positive value.
+    UN = ~N + 1UL;
   }
 
-  return this->operator<<(static_cast<unsigned long>(N));
+  return this->operator<<(UN);
 }
 
 raw_ostream &raw_ostream::operator<<(unsigned long long N) {
@@ -169,13 +174,18 @@ raw_ostream &raw_ostream::operator<<(unsigned long long N) {
 }
 
 raw_ostream &raw_ostream::operator<<(long long N) {
+  // A positive signed long has the same value when casted to its unsigned
+  // counterpart. If its negative, then we'll handle it in the below if block.
+  unsigned long long UN = static_cast<unsigned long long>(N);
+
   if (N < 0 && writeBase == 10) {
     *this << '-';
-    // Avoid undefined behavior on INT64_MIN with a cast.
-    N = -(unsigned long long)N;
+    // Since N is negative and we're storing the result in an unsigned Long,
+    // we can use the equivalence of -N == ~N + 1 to get the positive value.
+    UN = ~N + 1ULL;
   }
 
-  return this->operator<<(static_cast<unsigned long long>(N));
+  return this->operator<<(UN);
 }
 
 // HLSL Change Starts - Generalize non-base10 printing.
@@ -470,7 +480,10 @@ raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) {
     char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
     char *CurPtr = EndPtr;
     bool Neg = (FN.DecValue < 0);
-    uint64_t N = Neg ? -static_cast<uint64_t>(FN.DecValue) : FN.DecValue;
+    // If the value is negative, and because we are storing the result of the ~
+    // operation in an unsigned value, we can use the equivalence of
+    // -N == ~N + 1 to get the positive value of the negative number
+    uint64_t N = Neg ? (~FN.DecValue + 1UL) : FN.DecValue;
     while (N) {
       *--CurPtr = '0' + char(N % 10);
       N /= 10;

From e07be1c3541013f9604186c741969ccb51aa314d Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth@microsoft.com>
Date: Wed, 25 Jun 2025 14:32:58 -0600
Subject: [PATCH 76/93] Fix errors in retrieving and assigning load status
 parameter (#7513)

There were two problems with processing the status parameter with the
rework of the buffer load code. The first was that the status was not
being passed down to the load instruction generation for aggregate types
in any shader model version. The second was that the status retrieval
from the resret returned by the raw buffer loads was using the wrong
index for native vectors supported by shader model 6.9.

The status Value was not getting passed all the way down to the load
instruction generation for aggregate types because the refactored helper
constructor would always set it to null. It needs to be explicitly
stated since by that point, the original call instruction it came from
has been lost amidst subsequent GEPs, bitcasts, and/or loads that
aggregate types (arrays and structs) will use on the results of the
original call instruction to get the exact element required.

This changes the constructor to take an optional status parameter
allowing the locations where it might be set to pass it along. In other
cases, it will be null and be appropriately ignored.

Modified aggregate tests to verify this behavior. This required keeping
track of the return of the last load operation involved in a raw buffer
load, which made arrays more complicated. Rather than give them their
own CHECK prefix, I lumped them in with large matrices requiring three
loads. This did require making all the array lengths 3 to match. The
loss in test variability is worth the convenience as there is no known
distinction when it comes to array sizes over 1.

The status retrieval from the ResRet returned by the raw buffer loads
was using the wrong index for native vectors supported by shader model
6.9. Adjusting the index according to the opcode ensures that the index
will be correct.

This also required a change to validation that allows
checkAccessFullyMapped to operate on the second element extracted from a
ResRet where applicable and some corresponding null tolerance in related
code.

Adds status retrieving overloads to the relevant load/store tests for
sm6.9, aggregates, and other loads though the last category exhibited no
issues. At least I got some statuses right!

Fixes #7508
---
 include/dxc/DXIL/DxilConstants.h              |   1 +
 lib/DXIL/DxilOperations.cpp                   |   2 +-
 lib/DxilValidation/DxilValidation.cpp         |  10 +-
 lib/HLSL/HLOperationLower.cpp                 |  27 +--
 .../intrinsics/buffer-agg-load-stores.hlsl    | 165 ++++++++++++------
 .../intrinsics/buffer-load-stores-sm69.hlsl   |  45 ++++-
 .../hlsl/intrinsics/buffer-load-stores.hlsl   | 104 ++++++++++-
 7 files changed, 280 insertions(+), 74 deletions(-)

diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index 0f28edbc39..84588a2ff7 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -154,6 +154,7 @@ const float kMaxMipLodBias = 15.99f;
 const float kMinMipLodBias = -16.0f;
 
 const unsigned kResRetStatusIndex = 4;
+const unsigned kVecResRetStatusIndex = 1;
 
 /* <py::lines('OLOAD_DIMS-TEXT')>hctdb_instrhelp.get_max_oload_dims()</py>*/
 // OLOAD_DIMS-TEXT:BEGIN
diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index a66dfc68d4..253121346a 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -6438,7 +6438,7 @@ Type *OP::GetFourI32Type() const { return m_pFourI32Type; }
 Type *OP::GetFourI16Type() const { return m_pFourI16Type; }
 
 bool OP::IsResRetType(llvm::Type *Ty) {
-  if (!Ty->isStructTy())
+  if (!Ty || !Ty->isStructTy())
     return false;
   for (Type *ResTy : m_pResRetType) {
     if (Ty == ResTy)
diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp
index db596a3821..9587897e22 100644
--- a/lib/DxilValidation/DxilValidation.cpp
+++ b/lib/DxilValidation/DxilValidation.cpp
@@ -1573,9 +1573,15 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode Opcode,
       ValCtx.EmitInstrError(CI, ValidationRule::InstrCheckAccessFullyMapped);
     } else {
       Value *V = EVI->getOperand(0);
+      StructType *StrTy = dyn_cast<StructType>(V->getType());
+      unsigned ExtractIndex = EVI->getIndices()[0];
+      // Ensure parameter is a single value that is extracted from the correct
+      // ResRet struct location.
       bool IsLegal = EVI->getNumIndices() == 1 &&
-                     EVI->getIndices()[0] == DXIL::kResRetStatusIndex &&
-                     ValCtx.DxilMod.GetOP()->IsResRetType(V->getType());
+                     (ExtractIndex == DXIL::kResRetStatusIndex ||
+                      ExtractIndex == DXIL::kVecResRetStatusIndex) &&
+                     ValCtx.DxilMod.GetOP()->IsResRetType(StrTy) &&
+                     ExtractIndex == StrTy->getNumElements() - 1;
       if (!IsLegal) {
         ValCtx.EmitInstrError(CI, ValidationRule::InstrCheckAccessFullyMapped);
       }
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 7d5eb0edce..2033533327 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -3063,10 +3063,10 @@ static Value *ScalarizeResRet(Type *RetTy, Value *ResRet,
 }
 
 void UpdateStatus(Value *ResRet, Value *status, IRBuilder<> &Builder,
-                  hlsl::OP *hlslOp) {
+                  hlsl::OP *hlslOp,
+                  unsigned StatusIndex = DXIL::kResRetStatusIndex) {
   if (status && !isa<UndefValue>(status)) {
-    Value *statusVal =
-        Builder.CreateExtractValue(ResRet, DXIL::kResRetStatusIndex);
+    Value *statusVal = Builder.CreateExtractValue(ResRet, StatusIndex);
     Value *checkAccessOp = hlslOp->GetI32Const(
         static_cast<unsigned>(DXIL::OpCode::CheckAccessFullyMapped));
     Function *checkAccessFn = hlslOp->GetOpFunc(
@@ -4028,9 +4028,9 @@ struct ResLoadHelper {
   // Used for some subscript operators that feed the generic HL call inst
   // into a load op and by the matrixload call instruction.
   ResLoadHelper(Instruction *Inst, DxilResource::Kind RK, Value *h, Value *idx,
-                Value *Offset, Value *mip = nullptr)
+                Value *Offset, Value *status = nullptr, Value *mip = nullptr)
       : intrinsicOpCode(IntrinsicOp::Num_Intrinsics), handle(h), retVal(Inst),
-        addr(idx), offset(Offset), status(nullptr), mipLevel(mip) {
+        addr(idx), offset(Offset), status(status), mipLevel(mip) {
     opcode = LoadOpFromResKind(RK);
     Type *Ty = Inst->getType();
     if (opcode == OP::OpCode::RawBufferLoad && Ty->isVectorTy() &&
@@ -4304,18 +4304,22 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
 
     Function *F = OP->GetOpFunc(opcode, EltTy);
     Value *Ld = Builder.CreateCall(F, Args, OP::GetOpCodeName(opcode));
+    unsigned StatusIndex;
 
     // Extract elements from returned ResRet.
     // Native vector loads just have one vector element in the ResRet.
     // Others have up to four scalars that need to be individually extracted.
-    if (opcode == OP::OpCode::RawBufferVectorLoad)
+    if (opcode == OP::OpCode::RawBufferVectorLoad) {
       Elts[i++] = Builder.CreateExtractValue(Ld, 0);
-    else
+      StatusIndex = DXIL::kVecResRetStatusIndex;
+    } else {
       for (unsigned j = 0; j < chunkSize; j++, i++)
         Elts[i] = Builder.CreateExtractValue(Ld, j);
+      StatusIndex = DXIL::kResRetStatusIndex;
+    }
 
     // Update status.
-    UpdateStatus(Ld, helper.status, Builder, OP);
+    UpdateStatus(Ld, helper.status, Builder, OP, StatusIndex);
 
     if (!FirstLd)
       FirstLd = Ld;
@@ -8537,7 +8541,7 @@ Value *TranslateStructBufMatLd(CallInst *CI, IRBuilder<> &Builder,
                                Value *status, Value *bufIdx, Value *baseOffset,
                                const DataLayout &DL) {
 
-  ResLoadHelper helper(CI, RK, handle, bufIdx, baseOffset);
+  ResLoadHelper helper(CI, RK, handle, bufIdx, baseOffset, status);
 #ifndef NDEBUG
   Value *ptr = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx);
   Type *matType = ptr->getType()->getPointerElementType();
@@ -8864,7 +8868,7 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle,
     }
   } else if (LoadInst *LdInst = dyn_cast<LoadInst>(user)) {
     // Load of scalar/vector within a struct or structured raw load.
-    ResLoadHelper helper(LdInst, ResKind, handle, bufIdx, baseOffset);
+    ResLoadHelper helper(LdInst, ResKind, handle, bufIdx, baseOffset, status);
     TranslateBufLoad(helper, ResKind, Builder, OP, DL);
 
     LdInst->eraseFromParent();
@@ -9239,7 +9243,8 @@ void TranslateHLSubscript(CallInst *CI, HLSubscriptOpcode opcode,
     IRBuilder<> Builder(CI);
     if (LoadInst *ldInst = dyn_cast<LoadInst>(*U)) {
       Value *Offset = UndefValue::get(Builder.getInt32Ty());
-      ResLoadHelper ldHelper(ldInst, RK, handle, coord, Offset, mipLevel);
+      ResLoadHelper ldHelper(ldInst, RK, handle, coord, Offset,
+                             /*status*/ nullptr, mipLevel);
       TranslateBufLoad(ldHelper, RK, Builder, hlslOP, helper.dataLayout);
       ldInst->eraseFromParent();
     } else {
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl
index 9f7a487a05..572734d679 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl
@@ -1,35 +1,35 @@
-// RUN: %dxc -T vs_6_6              -DETY=float    -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=bool     -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=uint64_t -DCOLS=2 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=double   -DCOLS=2 %s | FileCheck %s
+// RUN: %dxc -T vs_6_6              -DETY=float     -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=bool      -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=uint64_t  -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=double    -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
 
-// RUN: %dxc -T vs_6_6              -DETY=float1    -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=bool1     -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=uint64_t1 -DCOLS=2 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=double1   -DCOLS=2 %s | FileCheck %s
+// RUN: %dxc -T vs_6_6              -DETY=float1    -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=bool1     -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=uint64_t1 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=double1   -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
 
-// RUN: %dxc -T vs_6_6              -DETY=float4    -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=bool4     -DCOLS=4 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=uint64_t4 -DCOLS=2 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6              -DETY=double4   -DCOLS=2 %s | FileCheck %s
+// RUN: %dxc -T vs_6_6              -DETY=float4    -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=bool4     -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=uint64_t4 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6              -DETY=double4   -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
 
 // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float    -DCOLS=2 -DROWS=2 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool     -DCOLS=2 -DROWS=2 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=2 -DROWS=2 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double   -DCOLS=2 -DROWS=2 %s | FileCheck %s
 
-// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float    -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool     -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double   -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
+// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float    -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool     -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double   -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
 
 // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float    -DCOLS=2 -DROWS=2 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=2 -DROWS=2 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double   -DCOLS=2 -DROWS=2 %s | FileCheck %s
-// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float    -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=bool     -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
-// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double   -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT
+// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float    -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=bool     -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
+// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double   -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI
 
 // RUN: %dxc -T vs_6_6 -DATY=Vector -DETY=float    -DCOLS=4 %s | FileCheck %s
 // RUN: %dxc -T vs_6_6 -DATY=Vector -DETY=bool     -DCOLS=4 %s | FileCheck %s
@@ -105,27 +105,27 @@ RWStructuredBuffer< TYPE SS > RwStBuf : register(u2);
 ConsumeStructuredBuffer< TYPE SS > CnStBuf : register(u4);
 AppendStructuredBuffer< TYPE SS > ApStBuf  : register(u5);
 
-TYPE Add(TYPE f1[COLS], TYPE f2[COLS])[COLS] {
+TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS])[COLS] {
   TYPE ret[COLS];
   for (int i = 0; i < COLS; i++)
-    ret[i] = f1[i] + f2[i];
+    ret[i] = f1[i] + f2[i] + f3[i] + f4[i];
   return ret;
 }
 
 template<typename T>
-T Add(T v1, T v2) { return v1 + v2; }
+T Add(T v1, T v2, T v3, T v4) { return v1 + v2 + v3 + v4; }
 
-TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS])[COLS] {
+TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS], TYPE f5[COLS], TYPE f6[COLS])[COLS] {
   TYPE ret[COLS];
   for (int i = 0; i < COLS; i++)
-    ret[i] = f1[i] + f2[i] + f3[i] + f4[i];
+    ret[i] = f1[i] + f2[i] + f3[i] + f4[i] + f5[i] + f6[i];
   return ret;
 }
 
 template<typename T>
-T Add(T v1, T v2, T v3, T v4) { return v1 + v2 + v3 + v4; }
+T Add(T v1, T v2, T v3, T v4, T v5, T v6) { return v1 + v2 + v3 + v4 + v5 + v6; }
 
-void main(uint ix[2] : IX) {
+void main(uint ix[3] : IX) {
   // ByteAddressBuffer Tests
 
   // CHECK-DAG: [[HDLROBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false)
@@ -150,26 +150,55 @@ void main(uint ix[2] : IX) {
   // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
   // OFF: [[RIX0:%.*]] = add i32 [[IX0]], [[BOFF:[0-9]+]]
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX0]]
-  // MAT: [[IX0p4:%.*]] = add i32 [[RIX0]], [[p4:[0-9]+]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]]
-  // MAT: [[IX0p8:%.*]] = add i32 [[RIX0]], [[p8:[0-9]+]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]]
+  // MULTI: [[IX0p4:%.*]] = add i32 [[RIX0]], [[p4:[0-9]+]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]]
+  // MULTI: [[IX0p8:%.*]] = add i32 [[RIX0]], [[p8:[0-9]+]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]]
   // I1: icmp ne i32
   // I1: icmp ne i32
   // I1: icmp ne i32
   // I1: icmp ne i32
   TYPE babElt1 SS = RwByBuf.Load< TYPE SS >(ix[0]);
 
+  // CHECK-DAG: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 1
+  // CHECK-DAG: [[RIX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 1
+  // OFF: [[RIX1:%.*]] = add i32 [[IX1]], [[BOFF]]
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX1]]
+  // MULTI: [[IX1p4:%.*]] = add i32 [[RIX1]], [[p4]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1p4]]
+  // MULTI: [[IX1p8:%.*]] = add i32 [[RIX1]], [[p8]]
+  // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1p8]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32
+  // I1: icmp ne i32
+  // I1: icmp ne i32
+  // I1: icmp ne i32
+  uint status1;
+  TYPE babElt3 SS = RwByBuf.Load< TYPE SS >(ix[1], status1);
+
   // CHECK: [[ANHDLROBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROBY]]
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[RIX0]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p4]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p8]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p4]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE babElt2 SS = RoByBuf.Load< TYPE SS >(ix[0]);
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[RIX1]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1p4]]
+  // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1p8]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  uint status2;
+  TYPE babElt4 SS = RoByBuf.Load< TYPE SS >(ix[1], status2);
+
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
@@ -177,48 +206,76 @@ void main(uint ix[2] : IX) {
   // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 {{%.*}}, i32 undef, float 0.0
   // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 {{%.*}}, i32 undef, double 0.0
   // CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX0]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]]
-  RwByBuf.Store< TYPE SS >(ix[0], Add(babElt1, babElt2));
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]]
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]]
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100
+  RwByBuf.Store< TYPE SS >(ix[0], Add(babElt1, babElt2, babElt3, babElt4));
+  RwByBuf.Store< uint > (100, status1 && status2);
 
   // StructuredBuffer Tests
   // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]]
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[BOFF]]
-  // MAT:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]]
-  // MAT:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]]
+  // MULTI:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]]
+  // MULTI:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt1 SS = RwStBuf.Load(ix[0]);
-  // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
+
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[BOFF]]
-  // MAT:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p4]]
-  // MAT:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p8]]
+  // MULTI:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p4]]
+  // MULTI:  call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt2 SS = RwStBuf[ix[1]];
 
+  // CHECK: [[IX2:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 2
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[BOFF]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[p4]]
+  // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[p8]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE stbElt5 SS = RwStBuf.Load(ix[2], status1);
+
   // CHECK: [[ANHDLROST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROST]]
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[BOFF]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p4]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p8]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p4]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt3 SS = RoStBuf.Load(ix[0]);
+
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[BOFF]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p4]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p8]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p4]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt4 SS = RoStBuf[ix[1]];
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[BOFF]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[p4]]
+  // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[p8]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE stbElt6 SS = RoStBuf.Load(ix[2], status2);
+
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
@@ -226,9 +283,13 @@ void main(uint ix[2] : IX) {
   // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 0, float 0.0
   // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 16, double 0.0
   // CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[BOFF]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]]
-  RwStBuf[ix[0]] = Add(stbElt1, stbElt2, stbElt3, stbElt4);
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]]
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]]
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200
+  RwStBuf[ix[0]] = Add(stbElt1, stbElt2, stbElt3, stbElt4, stbElt5, stbElt6);
+  RwByBuf.Store< uint > (200, status1 && status2);
 
   // {Append/Consume}StructuredBuffer Tests
   // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]]
@@ -236,8 +297,8 @@ void main(uint ix[2] : IX) {
   // OFF: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 
   // OFF: call %dx.types.ResRet.f64 @dx.op.rawBufferLoad.f64(i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 16
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[BOFF]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p4]]
-  // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p8]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p4]]
+  // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p8]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
@@ -253,7 +314,7 @@ void main(uint ix[2] : IX) {
   // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 0
   // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 16
   // CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[BOFF]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p4]]
-  // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p8]]
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p4]]
+  // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p8]]
   ApStBuf.Append(cnElt);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl
index 5305ee495b..f71b29e83e 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl
@@ -26,7 +26,7 @@ AppendStructuredBuffer<vector<TYPE, NUM> > ApStBuf  : register(u5);
 
 // CHECK-LABEL: define void @main
 [shader("vertex")]
-void main(uint ix[2] : IX) {
+void main(uint ix[3] : IX) {
   // ByteAddressBuffer Tests
 
   // CHECK-DAG: [[HDLROBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false)
@@ -45,36 +45,73 @@ void main(uint ix[2] : IX) {
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  babElt1 = RwByBuf.Load< vector<TYPE, NUM>  >(ix[0]);
 
+  // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
+  uint status1;
+  vector<TYPE, NUM>  babElt3 = RwByBuf.Load< vector<TYPE, NUM>  >(ix[1], status1);
+
   // CHECK: [[ANHDLROBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROBY]]
   // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0]]
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  babElt2 = RoByBuf.Load< vector<TYPE, NUM>  >(ix[0]);
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
+  uint status2;
+  vector<TYPE, NUM>  babElt4 = RoByBuf.Load< vector<TYPE, NUM>  >(ix[1], status2);
+
   // I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32>
   // CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]]
-  RwByBuf.Store< vector<TYPE, NUM>  >(ix[0], babElt1 + babElt2);
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100
+  RwByBuf.Store< vector<TYPE, NUM>  >(ix[0], babElt1 + babElt2 + babElt3 + babElt4);
+  RwByBuf.Store< uint > (100, status1 && status2);
 
   // StructuredBuffer Tests
   // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]]
   // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]]
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  stbElt1 = RwStBuf.Load(ix[0]);
-  // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
+
   // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]]
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  stbElt2 = RwStBuf[ix[1]];
 
+  // CHECK: [[IX2:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
+  vector<TYPE, NUM>  stbElt5 = RwStBuf.Load(ix[2], status1);
+
   // CHECK: [[ANHDLROST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROST]]
   // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]]
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  stbElt3 = RoStBuf.Load(ix[0]);
+
   // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]]
   // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
   vector<TYPE, NUM>  stbElt4 = RoStBuf[ix[1]];
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
+  vector<TYPE, NUM>  stbElt6 = RoStBuf.Load(ix[2], status2);
+
   // I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32>
   // CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]]
-  RwStBuf[ix[0]] = stbElt1 + stbElt2 + stbElt3 + stbElt4;
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200
+  RwStBuf[ix[0]] = stbElt1 + stbElt2 + stbElt3 + stbElt4 + stbElt5 + stbElt6;
+  RwByBuf.Store< uint > (200, status1 && status2);
 
   // {Append/Consume}StructuredBuffer Tests
   // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]]
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl
index 8dcf5ead1c..896f442c2c 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl
@@ -87,12 +87,36 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE babElt2 = RoByBuf.Load< TYPE >(ix0);
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  uint status1 = 0;
+  TYPE babElt3 = RwByBuf.Load< TYPE >(ix1, status1);
+
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  uint status2 = 0;
+  TYPE babElt4 = RoByBuf.Load< TYPE >(ix1, status2);
+
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // CHECK: all void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]]
-  RwByBuf.Store< TYPE >(ix0, babElt1 + babElt2);
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100
+  RwByBuf.Store< TYPE >(ix0, babElt1 + babElt2 + babElt3 + babElt4);
+  RwByBuf.Store< uint > (100, status1 && status2);
 
   // StructuredBuffer Tests
   // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]]
@@ -102,6 +126,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt1 = RwStBuf.Load(ix0);
+
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
@@ -116,6 +141,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt3 = RoStBuf.Load(ix0);
+
   // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]]
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
@@ -123,12 +149,34 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE stbElt4 = RoStBuf[ix1];
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX20]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE stbElt5 = RwStBuf.Load(ix2[0], status1);
+
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX20]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE stbElt6 = RoStBuf.Load(ix2[0], status2);
+
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // CHECK: all void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]]
-  RwStBuf[ix0] = stbElt1 + stbElt2 + stbElt3 + stbElt4;
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200
+  RwStBuf[ix0] = stbElt1 + stbElt2 + stbElt3 + stbElt4 + stbElt5 + stbElt6;
+  RwByBuf.Store< uint > (200, status1 && status2);
 
   // {Append/Consume}StructuredBuffer Tests
   // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]]
@@ -167,6 +215,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE typElt1 = RwTyBuf.Load(ix0);
+
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX1]]
   // F64: call double @dx.op.makeDouble.f64(i32 101
   // F64: call double @dx.op.makeDouble.f64(i32 101
@@ -183,6 +232,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE typElt2 = RwTyBuf[ix1];
+
   // CHECK: [[ANHDLROTY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROTY]]
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX0]]
   // F64: call double @dx.op.makeDouble.f64(i32 101
@@ -200,6 +250,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE typElt3 = RoTyBuf.Load(ix0);
+
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX1]]
   // F64: call double @dx.op.makeDouble.f64(i32 101
   // F64: call double @dx.op.makeDouble.f64(i32 101
@@ -217,6 +268,44 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE typElt4 = RoTyBuf[ix1];
 
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX20]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY32]] [[RESRET]], 4
+  // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // F64: call double @dx.op.makeDouble.f64(i32 101
+  // F64: call double @dx.op.makeDouble.f64(i32 101
+  // I64: zext i32 %{{.*}} to i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: shl nuw i64
+  // I64: or i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: shl nuw i64
+  // I64: or i64
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE typElt5 = RwTyBuf.Load(ix2[0], status1);
+
+  // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX20]]
+  // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY32]] [[RESRET]], 4
+  // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]])
+  // F64: call double @dx.op.makeDouble.f64(i32 101
+  // F64: call double @dx.op.makeDouble.f64(i32 101
+  // I64: zext i32 %{{.*}} to i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: shl nuw i64
+  // I64: or i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: zext i32 %{{.*}} to i64
+  // I64: shl nuw i64
+  // I64: or i64
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  // I1: icmp ne i32 %{{.*}}, 0
+  TYPE typElt6 = RoTyBuf.Load(ix2[0], status2);
+
   // F64: call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102
   // F64: call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102
   // I64: trunc i64 %{{.*}} to i32
@@ -229,8 +318,12 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
   // I1: zext i1 %{{.*}} to i32
-  // CHECK: all void @dx.op.bufferStore.[[TY32]](i32 69, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX0]]
-  RwTyBuf[ix0] = typElt1 + typElt2 + typElt3 + typElt4;
+  // CHECK: call void @dx.op.bufferStore.[[TY32]](i32 69, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX0]]
+  // CHECK: and i1 [[CHK1]], [[CHK2]]
+  // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 300
+  RwTyBuf[ix0] = typElt1 + typElt2 + typElt3 + typElt4 + typElt5 + typElt6;
+  RwByBuf.Store< uint > (300, status1 && status2);
 
   // Texture Tests
   // CHECK: [[ANHDLROTX1:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROTX1]]
@@ -250,6 +343,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE texElt1 = RoTex1d[ix0];
+
   // CHECK: [[ANHDLRWTX1:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX1]]
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX1]], i32 undef, i32 [[IX0]], i32 undef, i32 undef
   // F64: call double @dx.op.makeDouble.f64(i32 101
@@ -285,6 +379,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE texElt3 = RoTex2d[ix2];
+
   // CHECK: [[ANHDLRWTX2:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX2]]
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX2]], i32 undef, i32 [[IX20]], i32 [[IX21]], i32 undef
   // F64: call double @dx.op.makeDouble.f64(i32 101
@@ -320,6 +415,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) {
   // I1: icmp ne i32 %{{.*}}, 0
   // I1: icmp ne i32 %{{.*}}, 0
   TYPE texElt5 = RoTex3d[ix3];
+
   // CHECK: [[ANHDLRWTX3:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX3]]
   // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX3]], i32 undef, i32 [[IX30]], i32 [[IX31]], i32 [[IX32]]
   // F64: call double @dx.op.makeDouble.f64(i32 101

From 93c7c2c8e62358ec3600350ce5763dfadbb6d3a0 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 25 Jun 2025 18:43:47 -0700
Subject: [PATCH 77/93] Update Release Notes (#7563)

Update release notes in preparation for release
---
 docs/ReleaseNotes.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 274164158e..6850902a81 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -19,7 +19,9 @@ The included licenses apply to the following files:
 
 ### Upcoming Release
 
-Place release notes for the upcoming release below this line and remove this line upon naming this release.
+- Fix regression: [#7510](https://github.com/microsoft/DirectXShaderCompiler/issues/7510) crash when calling `sizeof` on templated type.
+- Fix regression: [#7508](https://github.com/microsoft/DirectXShaderCompiler/issues/7508) crash when calling `Load` with `status`.
+- Header file `dxcpix.h` was added to the release package.
 
 ### Version 1.8.2505
 

From 8a9f8820723dd1677f698fe98b396666f34a2694 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 26 Jun 2025 10:15:30 -0700
Subject: [PATCH 78/93] Bump urllib3 from 2.2.2 to 2.5.0 in /utils/git (#7554)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.2.2 to 2.5.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/urllib3/urllib3/releases">urllib3's
releases</a>.</em></p>
<blockquote>
<h2>2.5.0</h2>
<h2>🚀 urllib3 is fundraising for HTTP/2 support</h2>
<p><a
href="https://sethmlarson.dev/urllib3-is-fundraising-for-http2-support">urllib3
is raising ~$40,000 USD</a> to release HTTP/2 support and ensure
long-term sustainable maintenance of the project after a sharp decline
in financial support. If your company or organization uses Python and
would benefit from HTTP/2 support in Requests, pip, cloud SDKs, and
thousands of other projects <a
href="https://opencollective.com/urllib3">please consider contributing
financially</a> to ensure HTTP/2 support is developed sustainably and
maintained for the long-haul.</p>
<p>Thank you for your support.</p>
<h1>Security issues</h1>
<p>urllib3 2.5.0 fixes two moderate security issues:</p>
<ul>
<li>Pool managers now properly control redirects when
<code>retries</code> is passed — CVE-2025-50181 reported by <a
href="https://github.com/sandumjacob"><code>@​sandumjacob</code></a>
(5.3 Medium, GHSA-pq67-6m6q-mj2v)</li>
<li>Redirects are now controlled by urllib3 in the Node.js runtime —
CVE-2025-50182 (5.3 Medium, GHSA-48p4-8xcf-vxj5)</li>
</ul>
<h1>Features</h1>
<ul>
<li>Added support for the <code>compression.zstd</code> module that is
new in Python 3.14. See <a href="https://peps.python.org/pep-0784/">PEP
784</a> for more information. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3610">#3610</a>)</li>
<li>Added support for version 0.5 of <code>hatch-vcs</code> (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3612">#3612</a>)</li>
</ul>
<h1>Bugfixes</h1>
<ul>
<li>Raised exception for <code>HTTPResponse.shutdown</code> on a
connection already released to the pool. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3581">#3581</a>)</li>
<li>Fixed incorrect <code>CONNECT</code> statement when using an IPv6
proxy with <code>connection_from_host</code>. Previously would not be
wrapped in <code>[]</code>. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3615">#3615</a>)</li>
</ul>
<h2>2.4.0</h2>
<h2>🚀 urllib3 is fundraising for HTTP/2 support</h2>
<p><a
href="https://sethmlarson.dev/urllib3-is-fundraising-for-http2-support">urllib3
is raising ~$40,000 USD</a> to release HTTP/2 support and ensure
long-term sustainable maintenance of the project after a sharp decline
in financial support. If your company or organization uses Python and
would benefit from HTTP/2 support in Requests, pip, cloud SDKs, and
thousands of other projects <a
href="https://opencollective.com/urllib3">please consider contributing
financially</a> to ensure HTTP/2 support is developed sustainably and
maintained for the long-haul.</p>
<p>Thank you for your support.</p>
<h1>Features</h1>
<ul>
<li>Applied PEP 639 by specifying the license fields in pyproject.toml.
(<a
href="https://redirect.github.com/urllib3/urllib3/issues/3522">#3522</a>)</li>
<li>Updated exceptions to save and restore more properties during the
pickle/serialization process. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3567">#3567</a>)</li>
<li>Added <code>verify_flags</code> option to
<code>create_urllib3_context</code> with a default of
<code>VERIFY_X509_PARTIAL_CHAIN</code> and
<code>VERIFY_X509_STRICT</code> for Python 3.13+. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3571">#3571</a>)</li>
</ul>
<h1>Bugfixes</h1>
<ul>
<li>Fixed a bug with partial reads of streaming data in Emscripten. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3555">#3555</a>)</li>
</ul>
<h1>Misc</h1>
<ul>
<li>Switched to uv for installing development dependecies. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3550">#3550</a>)</li>
<li>Removed the <code>multiple.intoto.jsonl</code> asset from GitHub
releases. Attestation of release files since v2.3.0 can be found on
PyPI. (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3566">#3566</a>)</li>
</ul>
<h2>2.3.0</h2>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/urllib3/urllib3/blob/main/CHANGES.rst">urllib3's
changelog</a>.</em></p>
<blockquote>
<h1>2.5.0 (2025-06-18)</h1>
<h2>Features</h2>
<ul>
<li>Added support for the <code>compression.zstd</code> module that is
new in Python 3.14.
See <code>PEP 784 &lt;https://peps.python.org/pep-0784/&gt;</code>_ for
more information.
(<code>[#3610](https://github.com/urllib3/urllib3/issues/3610)
&lt;https://github.com/urllib3/urllib3/issues/3610&gt;</code>__)</li>
<li>Added support for version 0.5 of <code>hatch-vcs</code>
(<code>[#3612](https://github.com/urllib3/urllib3/issues/3612)
&lt;https://github.com/urllib3/urllib3/issues/3612&gt;</code>__)</li>
</ul>
<h2>Bugfixes</h2>
<ul>
<li>Fixed a security issue where restricting the maximum number of
followed
redirects at the <code>urllib3.PoolManager</code> level via the
<code>retries</code> parameter
did not work.</li>
<li>Made the Node.js runtime respect redirect parameters such as
<code>retries</code>
and <code>redirects</code>.</li>
<li>Raised exception for <code>HTTPResponse.shutdown</code> on a
connection already released to the pool.
(<code>[#3581](https://github.com/urllib3/urllib3/issues/3581)
&lt;https://github.com/urllib3/urllib3/issues/3581&gt;</code>__)</li>
<li>Fixed incorrect <code>CONNECT</code> statement when using an IPv6
proxy with <code>connection_from_host</code>. Previously would not be
wrapped in <code>[]</code>.
(<code>[#3615](https://github.com/urllib3/urllib3/issues/3615)
&lt;https://github.com/urllib3/urllib3/issues/3615&gt;</code>__)</li>
</ul>
<h1>2.4.0 (2025-04-10)</h1>
<h2>Features</h2>
<ul>
<li>Applied PEP 639 by specifying the license fields in pyproject.toml.
(<code>[#3522](https://github.com/urllib3/urllib3/issues/3522)
&lt;https://github.com/urllib3/urllib3/issues/3522&gt;</code>__)</li>
<li>Updated exceptions to save and restore more properties during the
pickle/serialization process.
(<code>[#3567](https://github.com/urllib3/urllib3/issues/3567)
&lt;https://github.com/urllib3/urllib3/issues/3567&gt;</code>__)</li>
<li>Added <code>verify_flags</code> option to
<code>create_urllib3_context</code> with a default of
<code>VERIFY_X509_PARTIAL_CHAIN</code> and
<code>VERIFY_X509_STRICT</code> for Python 3.13+.
(<code>[#3571](https://github.com/urllib3/urllib3/issues/3571)
&lt;https://github.com/urllib3/urllib3/issues/3571&gt;</code>__)</li>
</ul>
<h2>Bugfixes</h2>
<ul>
<li>Fixed a bug with partial reads of streaming data in Emscripten.
(<code>[#3555](https://github.com/urllib3/urllib3/issues/3555)
&lt;https://github.com/urllib3/urllib3/issues/3555&gt;</code>__)</li>
</ul>
<h2>Misc</h2>
<ul>
<li>Switched to uv for installing development dependecies.
(<code>[#3550](https://github.com/urllib3/urllib3/issues/3550)
&lt;https://github.com/urllib3/urllib3/issues/3550&gt;</code>__)</li>
<li>Removed the <code>multiple.intoto.jsonl</code> asset from GitHub
releases. Attestation of release files since v2.3.0 can be found on
PyPI. (<code>[#3566](https://github.com/urllib3/urllib3/issues/3566)
&lt;https://github.com/urllib3/urllib3/issues/3566&gt;</code>__)</li>
</ul>
<h1>2.3.0 (2024-12-22)</h1>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/urllib3/urllib3/commit/aaab4eccc10c965897540b21e15f11859d0b62e7"><code>aaab4ec</code></a>
Release 2.5.0</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/7eb4a2aafe49a279c29b6d1f0ed0f42e9736194f"><code>7eb4a2a</code></a>
Merge commit from fork</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/f05b1329126d5be6de501f9d1e3e36738bc08857"><code>f05b132</code></a>
Merge commit from fork</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/d03fe327a71d09728512217149f269763671f296"><code>d03fe32</code></a>
Fix HTTP tunneling with IPv6 in older Python versions</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/11661e9bb4278e43d081f47a516e287a928c2206"><code>11661e9</code></a>
Bump github/codeql-action from 3.28.0 to 3.29.0 (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3624">#3624</a>)</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/6a0ecc6b16fe30f721021b44a81d19615098c71e"><code>6a0ecc6</code></a>
Update v2 migration guide to 2.4.0 (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3621">#3621</a>)</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/8e32e60d9024c05bc6f7adda08bdf6c539d0b0d4"><code>8e32e60</code></a>
Raise exception for shutdown on a connection already released to the
pool (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3">#3</a>...</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/9996e0fbf90b77083ad3c73737a6c6395703faa9"><code>9996e0f</code></a>
Fix emscripten CI for Chrome 137+ (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3599">#3599</a>)</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/4fd1a99a59725faf0efc946ce3b6bc9a194420af"><code>4fd1a99</code></a>
Bump RECENT_DATE (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3617">#3617</a>)</li>
<li><a
href="https://github.com/urllib3/urllib3/commit/c4b5917e911a90c8bf279448df8952a682294135"><code>c4b5917</code></a>
Add support for the new <code>compression.zstd</code> module in Python
3.14 (<a
href="https://redirect.github.com/urllib3/urllib3/issues/3611">#3611</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/urllib3/urllib3/compare/2.2.2...2.5.0">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=urllib3&package-manager=pip&previous-version=2.2.2&new-version=2.5.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the
[Security Alerts
page](https://github.com/microsoft/DirectXShaderCompiler/network/alerts).

</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 utils/git/requirements_formatting.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/git/requirements_formatting.txt b/utils/git/requirements_formatting.txt
index 14123e4ac0..2afb003c4f 100644
--- a/utils/git/requirements_formatting.txt
+++ b/utils/git/requirements_formatting.txt
@@ -46,7 +46,7 @@ requests==2.32.4
     # via pygithub
 toml==0.10.2
     # via darker
-urllib3==2.2.2
+urllib3==2.5.0
     # via requests
 wrapt==1.15.0
     # via deprecated

From 94abfe972ad839185965f670329bcf33cd7bccbd Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Thu, 26 Jun 2025 13:04:21 -0700
Subject: [PATCH 79/93] Address compiler warnings: Enable warning C4146 as a
 break.  (#7587)

Addresses #7584 by removing the warning disable for 4146. Also includes
a few trivial fixes for C4146 across several files that were missed in
previous PRs.
---
 cmake/modules/HandleLLVMOptions.cmake            | 1 -
 include/llvm/ADT/IntervalMap.h                   | 6 +++++-
 lib/Analysis/LoopAccessAnalysis.cpp              | 4 ++--
 lib/Transforms/Scalar/LoadCombine.cpp            | 2 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp       | 6 +++---
 tools/clang/lib/AST/SelectorLocationsKind.cpp    | 4 ++--
 tools/clang/lib/CodeGen/ItaniumCXXABI.cpp        | 2 +-
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 2 +-
 tools/clang/unittests/HLSLExec/ShaderOpTest.cpp  | 2 +-
 tools/clang/unittests/HLSLExec/ShaderOpTest.h    | 2 +-
 unittests/ADT/APIntTest.cpp                      | 3 ++-
 unittests/ADT/BitVectorTest.cpp                  | 7 +++++--
 unittests/Support/DataExtractorTest.cpp          | 6 ++++--
 13 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index acf76c2907..00bdaed363 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -301,7 +301,6 @@ if( MSVC )
 
   set(msvc_warning_flags
     # Disabled warnings.
-    -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
     -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
     -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
     -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
diff --git a/include/llvm/ADT/IntervalMap.h b/include/llvm/ADT/IntervalMap.h
index 2a00667227..5bb948727e 100644
--- a/include/llvm/ADT/IntervalMap.h
+++ b/include/llvm/ADT/IntervalMap.h
@@ -320,7 +320,11 @@ class NodeBase {
       return Count;
     } else {
       // We want to shrink, copy to sib.
-      unsigned Count = std::min(std::min(unsigned(-Add), Size), N - SSize);
+      // Count <= INT_MAX: Since Add is an int, unsigned(-Add) <= 2^31, so
+      // std::min result <= INT_MAX. Meaning it's safe to store the result in an
+      // int to avoid the compiler warning for '-Count' if we were to use an
+      // unsigned value instead.
+      int Count = std::min(std::min(unsigned(-Add), Size), N - SSize);
       transferToLeftSib(Size, Sib, SSize, Count);
       return -Count;
     }
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 7e5e3e5ebd..d855df32dc 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1677,8 +1677,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                                const ValueToValueMap &Strides)
     : PtrRtChecking(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL),
       TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
-      MaxSafeDepDistBytes(-1U), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      MaxSafeDepDistBytes(std::numeric_limits<unsigned>::max()),
+      CanVecMem(false), StoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(Strides);
 }
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 8f22bb337d..fb48513c18 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -186,7 +186,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
 
   // Find first load. This is where we put the new load.
   LoadPOPPair FirstLP;
-  FirstLP.InsertOrder = -1u;
+  FirstLP.InsertOrder = std::numeric_limits<unsigned>::max();
   for (const auto &L : Loads)
     if (L.InsertOrder < FirstLP.InsertOrder)
       FirstLP = L;
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 69ca2688c8..d8e8fa11bd 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4472,8 +4472,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
 
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-  unsigned MaxSafeDepDist = -1U;
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
+  unsigned MaxSafeDepDist = std::numeric_limits<unsigned>::max();
+  if (Legal->getMaxSafeDepDistBytes() != std::numeric_limits<unsigned>::max())
     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                     WidestRegister : MaxSafeDepDist);
@@ -4638,7 +4638,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
     return 1;
 
   // We used the distance for the interleave count.
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
+  if (Legal->getMaxSafeDepDistBytes() != std::numeric_limits<unsigned>::max())
     return 1;
 
   // Do not interleave loops with a relatively small trip count.
diff --git a/tools/clang/lib/AST/SelectorLocationsKind.cpp b/tools/clang/lib/AST/SelectorLocationsKind.cpp
index 671207a7f2..36fd8cea6e 100644
--- a/tools/clang/lib/AST/SelectorLocationsKind.cpp
+++ b/tools/clang/lib/AST/SelectorLocationsKind.cpp
@@ -28,7 +28,7 @@ static SourceLocation getStandardSelLoc(unsigned Index,
     if (EndLoc.isInvalid())
       return SourceLocation();
     IdentifierInfo *II = Sel.getIdentifierInfoForSlot(0);
-    unsigned Len = II ? II->getLength() : 0;
+    int Len = II ? II->getLength() : 0;
     return EndLoc.getLocWithOffset(-Len);
   }
 
@@ -36,7 +36,7 @@ static SourceLocation getStandardSelLoc(unsigned Index,
   if (ArgLoc.isInvalid())
     return SourceLocation();
   IdentifierInfo *II = Sel.getIdentifierInfoForSlot(Index);
-  unsigned Len = /* selector id */ (II ? II->getLength() : 0) + /* ':' */ 1;
+  int Len = /* selector id */ (II ? II->getLength() : 0) + /* ':' */ 1;
   if (WithArgSpace)
     ++Len;
   return ArgLoc.getLocWithOffset(-Len);
diff --git a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 698d34c774..f39ec6d497 100644
--- a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1090,7 +1090,7 @@ llvm::Value *ItaniumCXXABI::EmitTypeid(CodeGenFunction &CGF,
       CGF.GetVTablePtr(ThisPtr, StdTypeInfoPtrTy->getPointerTo());
 
   // Load the type info.
-  Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1ULL);
+  Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1LL);
   return CGF.Builder.CreateLoad(Value);
 }
 
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index c26b9a1b5b..586c55328d 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -11,7 +11,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 // We need to keep & fix these warnings to integrate smoothly with HLK
-#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389 4018)
+#pragma warning(error : 4100 4242 4244 4267 4701 4389 4018)
 
 // *** THIS FILE CANNOT TAKE ANY LLVM DEPENDENCIES  *** //
 
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
index 9e18351a6d..60ce3a9241 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp
@@ -10,7 +10,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 // We need to keep & fix these warnings to integrate smoothly with HLK
-#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389)
+#pragma warning(error : 4100 4242 4244 4267 4701 4389)
 
 #include "d3dx12.h"
 #include <atlbase.h>
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.h b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
index 52b5f37730..e8298fc8d9 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpTest.h
+++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.h
@@ -26,7 +26,7 @@
 #include <vector>
 
 // We need to keep & fix these warnings to integrate smoothly with HLK
-#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389)
+#pragma warning(error : 4100 4242 4244 4267 4701 4389)
 
 ///////////////////////////////////////////////////////////////////////////////
 // Forward declarations.
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index ffba7b1633..a15307023e 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "gtest/gtest.h"
 #include <array>
+#include <limits>
 #include <ostream>
 
 using namespace llvm;
@@ -753,7 +754,7 @@ TEST(APIntTest, StringDeath) {
 #endif
 
 TEST(APIntTest, mul_clear) {
-  APInt ValA(65, -1ULL);
+  APInt ValA(65, std::numeric_limits<uint64_t>::max());
   APInt ValB(65, 4);
   APInt ValC(65, 0);
   ValC = ValA * ValB;
diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp
index 26f103b3c1..c7de9194c4 100644
--- a/unittests/ADT/BitVectorTest.cpp
+++ b/unittests/ADT/BitVectorTest.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "gtest/gtest.h"
+#include <limits>
 
 using namespace llvm;
 
@@ -73,7 +74,8 @@ TYPED_TEST(BitVectorTest, TrivialOperation) {
   Vec.resize(33, true);
   Vec.resize(57, false);
   unsigned Count = 0;
-  for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) {
+  for (unsigned i = Vec.find_first(); i != std::numeric_limits<unsigned>::max();
+       i = Vec.find_next(i)) {
     ++Count;
     EXPECT_TRUE(Vec[i]);
     EXPECT_TRUE(Vec.test(i));
@@ -103,7 +105,8 @@ TYPED_TEST(BitVectorTest, TrivialOperation) {
   Vec.resize(91, true);
   Vec.resize(130, false);
   Count = 0;
-  for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) {
+  for (unsigned i = Vec.find_first(); i != std::numeric_limits<unsigned>::max();
+       i = Vec.find_next(i)) {
     ++Count;
     EXPECT_TRUE(Vec[i]);
     EXPECT_TRUE(Vec.test(i));
diff --git a/unittests/Support/DataExtractorTest.cpp b/unittests/Support/DataExtractorTest.cpp
index 81de983d22..250b89d696 100644
--- a/unittests/Support/DataExtractorTest.cpp
+++ b/unittests/Support/DataExtractorTest.cpp
@@ -7,8 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "gtest/gtest.h"
 #include "llvm/Support/DataExtractor.h"
+#include "gtest/gtest.h"
+#include <limits>
 using namespace llvm;
 
 namespace {
@@ -20,7 +21,8 @@ const char bigleb128data[] = "\xAA\xA9\xFF\xAA\xFF\xAA\xFF\x4A";
 
 TEST(DataExtractorTest, OffsetOverflow) {
   DataExtractor DE(StringRef(numberData, sizeof(numberData)-1), false, 8);
-  EXPECT_FALSE(DE.isValidOffsetForDataOfSize(-2U, 5));
+  EXPECT_FALSE(DE.isValidOffsetForDataOfSize(
+      std::numeric_limits<uint32_t>::max() - 1, 5));
 }
 
 TEST(DataExtractorTest, UnsignedNumbers) {

From a9d33d3500d37bd24c10288c76aca8e1c948d4a2 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Mon, 30 Jun 2025 17:03:14 -0700
Subject: [PATCH 80/93] [NFC] Clear C33010 Warning (#7603)

Resolves `C33010` which is currently blocking the release pipeline.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 lib/HLSL/DxilCondenseResources.cpp        |  2 +-
 tools/clang/include/clang/Sema/Overload.h | 15 ++++++++-------
 tools/clang/lib/Format/FormatToken.h      |  6 +++---
 tools/clang/lib/Sema/SemaOverload.cpp     |  4 ++--
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/lib/HLSL/DxilCondenseResources.cpp b/lib/HLSL/DxilCondenseResources.cpp
index 529c203bdc..09dd9cea64 100644
--- a/lib/HLSL/DxilCondenseResources.cpp
+++ b/lib/HLSL/DxilCondenseResources.cpp
@@ -655,7 +655,7 @@ class ResourceUseErrors {
 public:
   ResourceUseErrors() : m_bErrorsReported(false) {}
 
-  enum ErrorCode {
+  enum ErrorCode : unsigned int {
     // Collision between use of one resource GV and another.
     // All uses must be guaranteed to resolve to only one GV.
     // Additionally, when writing resource to alloca, all uses
diff --git a/tools/clang/include/clang/Sema/Overload.h b/tools/clang/include/clang/Sema/Overload.h
index 89de4ce984..473af49cab 100644
--- a/tools/clang/include/clang/Sema/Overload.h
+++ b/tools/clang/include/clang/Sema/Overload.h
@@ -57,7 +57,7 @@ namespace clang {
   /// convert an argument to a parameter's type. The enumerator values
   /// match with Table 9 of (C++ 13.3.3.1.1) and are listed such that
   /// better conversion kinds have smaller values.
-  enum ImplicitConversionKind {
+  enum ImplicitConversionKind : unsigned int {
     ICK_Identity = 0,          ///< Identity conversion (no conversion)
     ICK_Lvalue_To_Rvalue,      ///< Lvalue-to-rvalue conversion (C++ 4.1)
     ICK_Array_To_Pointer,      ///< Array-to-pointer conversion (C++ 4.2)
@@ -79,27 +79,28 @@ namespace clang {
     ICK_Vector_Conversion,     ///< Vector conversions
     ICK_Vector_Splat,          ///< A vector splat from an arithmetic type
     ICK_Complex_Real,          ///< Complex-real conversions (C99 6.3.1.7)
-    ICK_Block_Pointer_Conversion,    ///< Block Pointer conversions 
+    ICK_Block_Pointer_Conversion,   ///< Block Pointer conversions
     ICK_TransparentUnionConversion, ///< Transparent Union Conversions
-    ICK_Writeback_Conversion,  ///< Objective-C ARC writeback conversion
+    ICK_Writeback_Conversion,       ///< Objective-C ARC writeback conversion
     ICK_Zero_Event_Conversion, ///< Zero constant to event (OpenCL1.2 6.12.10)
 
     // HLSL Change Starts
-    // The following conversion types also imply a potential followup 
+    // The following conversion types also imply a potential followup
     // ComponentConversion.
     // List is roughly ordered to preserve the property:
     //   "better conversion kinds have smaller values"
-    // Unfortunately, this property isn't really possible to preserve due 
+    // Unfortunately, this property isn't really possible to preserve due
     // to potential additional component conversion.
     ICK_HLSLVector_Scalar,     ///< HLSLVector/Matrix to scalar
     ICK_HLSLVector_Conversion, ///< HLSLVector/Matrix conversion
-    ICK_Flat_Conversion,       ///< Flat assignment conversion for HLSL (inline conversion, straddled)
+    ICK_Flat_Conversion,       ///< Flat assignment conversion for HLSL (inline
+                               ///< conversion, straddled)
     ICK_HLSLVector_Splat,      ///< HLSLVector/Matrix splat
     ICK_HLSLVector_Truncation, ///< HLSLVector/Matrix truncation
     ICK_HLSL_Derived_To_Base,  ///< HLSL Derived-to-base
     // HLSL Change Ends
 
-    ICK_Num_Conversion_Kinds   ///< The number of conversion kinds
+    ICK_Num_Conversion_Kinds ///< The number of conversion kinds
   };
 
   /// ImplicitConversionRank - The rank of an implicit conversion
diff --git a/tools/clang/lib/Format/FormatToken.h b/tools/clang/lib/Format/FormatToken.h
index f335eda086..249d526871 100644
--- a/tools/clang/lib/Format/FormatToken.h
+++ b/tools/clang/lib/Format/FormatToken.h
@@ -86,11 +86,11 @@ namespace format {
   TYPE(UnaryOperator) \
   TYPE(Unknown)
 
-enum TokenType {
+enum TokenType : unsigned int {
 #define TYPE(X) TT_##X,
-LIST_TOKEN_TYPES
+  LIST_TOKEN_TYPES
 #undef TYPE
-  NUM_TOKEN_TYPES
+      NUM_TOKEN_TYPES
 };
 
 /// \brief Determines the name of a token type.
diff --git a/tools/clang/lib/Sema/SemaOverload.cpp b/tools/clang/lib/Sema/SemaOverload.cpp
index 636eaf0213..1bcbc7442f 100644
--- a/tools/clang/lib/Sema/SemaOverload.cpp
+++ b/tools/clang/lib/Sema/SemaOverload.cpp
@@ -146,8 +146,8 @@ ImplicitConversionRank clang::GetConversionRank(ImplicitConversionKind Kind) {
   };
   static_assert(_countof(Rank) == ICK_Num_Conversion_Kinds,
       "Otherwise, GetConversionRank is out of sync with ImplicitConversionKind"); // HLSL Change
-  assert((int)Kind < (int)ICK_Num_Conversion_Kinds); // HLSL Change
-  return Rank[(int)Kind];
+  assert(Kind < _countof(Rank)); // HLSL Change
+  return Rank[Kind];             // HLSL Change
 }
 
 /// GetImplicitConversionName - Return the name of this kind of

From 7e0d771197110c10a39d279dc6a3c3c213c056d3 Mon Sep 17 00:00:00 2001
From: Jeff Noyle <jeffno@microsoft.com>
Date: Tue, 1 Jul 2025 13:17:08 -0700
Subject: [PATCH 81/93] PIX: Report correct bitfield values in PIX shader
 debugger (#7557)

The key change here is the bitwise `&` in DxcPixDxilStorage.cpp. The
generated DXIL packs the bitfields into their 32- or 64-bit-typed Values
as expected, but this code, when trying to figure out which Value a
bitfield lives in, was looking up the unpacked bit offset, so only
fields within the zeroth underlying Value were being reported correctly.

With this change, PIX reports correct bitfield values wherever they
live, including within deeply nested structs.

Unfortunately, the tests had to be written in C++ because FileCheck
obviously doesn't run the APIs that PIX uses to read debug data.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 lib/DxilDia/DxcPixDxilStorage.cpp         |   6 +-
 tools/clang/unittests/HLSL/PixDiaTest.cpp | 167 +++++++++++++++++-----
 2 files changed, 138 insertions(+), 35 deletions(-)

diff --git a/lib/DxilDia/DxcPixDxilStorage.cpp b/lib/DxilDia/DxcPixDxilStorage.cpp
index 79d21303dc..4b06f472e8 100644
--- a/lib/DxilDia/DxcPixDxilStorage.cpp
+++ b/lib/DxilDia/DxcPixDxilStorage.cpp
@@ -185,7 +185,11 @@ dxil_debug_info::DxcPixDxilScalarStorage::Index(DWORD Index,
 STDMETHODIMP dxil_debug_info::DxcPixDxilScalarStorage::GetRegisterNumber(
     DWORD *pRegisterNumber) {
   const auto &ValueLocationMap = m_pVarInfo->m_ValueLocationMap;
-  auto RegIt = ValueLocationMap.find(m_OffsetFromStorageStartInBits);
+  // Bitfields will have been packed into their containing integer type:
+  DWORD size;
+  m_pOriginalType->GetSizeInBits(&size);
+  auto RegIt =
+      ValueLocationMap.find(m_OffsetFromStorageStartInBits & ~(size - 1));
 
   if (RegIt == ValueLocationMap.end()) {
     return E_FAIL;
diff --git a/tools/clang/unittests/HLSL/PixDiaTest.cpp b/tools/clang/unittests/HLSL/PixDiaTest.cpp
index a4439b998d..d36e762762 100644
--- a/tools/clang/unittests/HLSL/PixDiaTest.cpp
+++ b/tools/clang/unittests/HLSL/PixDiaTest.cpp
@@ -13,6 +13,7 @@
 #ifdef _WIN32
 
 #include <array>
+#include <set>
 
 #include "dxc/DxilContainer/DxilContainer.h"
 #include "dxc/Support/WinIncludes.h"
@@ -186,6 +187,7 @@ class PixDiaTest {
   TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Derived)
   TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Bool)
   TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Overlap)
+  TEST_METHOD(DxcPixDxilDebugInfo_BitFields_uint64)
   TEST_METHOD(DxcPixDxilDebugInfo_Min16SizesAndOffsets_Enabled)
   TEST_METHOD(DxcPixDxilDebugInfo_Min16SizesAndOffsets_Disabled)
   TEST_METHOD(DxcPixDxilDebugInfo_Min16VectorOffsets_Enabled)
@@ -658,11 +660,11 @@ class PixDiaTest {
       const char *hlsl, const wchar_t *profile,
       const char *lineAtWhichToExamineVariables,
       std::vector<VariableComponentInfo> const &ExpectedVariables);
-  void RunSizeAndOffsetTestCase(const char *hlsl,
-                                std::array<DWORD, 4> const &memberOffsets,
-                                std::array<DWORD, 4> const &memberSizes,
-                                std::vector<const wchar_t *> extraArgs = {
-                                    L"-Od"});
+  CComPtr<IDxcPixDxilStorage>
+  RunSizeAndOffsetTestCase(const char *hlsl,
+                           std::array<DWORD, 4> const &memberOffsets,
+                           std::array<DWORD, 4> const &memberSizes,
+                           std::vector<const wchar_t *> extraArgs = {L"-Od"});
   void RunVectorSizeAndOffsetTestCase(const char *hlsl,
                                       std::array<DWORD, 4> const &memberOffsets,
                                       std::vector<const wchar_t *> extraArgs = {
@@ -2948,12 +2950,11 @@ void main()
   VERIFY_ARE_EQUAL(32u, secondFieldOffset);
 }
 
-void PixDiaTest::RunSizeAndOffsetTestCase(
-    const char *hlsl, std::array<DWORD, 4> const &memberOffsets,
-    std::array<DWORD, 4> const &memberSizes,
-    std::vector<const wchar_t *> extraArgs) {
-  if (m_ver.SkipDxilVersion(1, 5))
-    return;
+CComPtr<IDxcPixDxilStorage>
+PixDiaTest::RunSizeAndOffsetTestCase(const char *hlsl,
+                                     std::array<DWORD, 4> const &memberOffsets,
+                                     std::array<DWORD, 4> const &memberSizes,
+                                     std::vector<const wchar_t *> extraArgs) {
   auto debugInfo =
       CompileAndCreateDxcDebug(hlsl, L"cs_6_5", nullptr, extraArgs).debugInfo;
   auto live = GetLiveVariablesAt(hlsl, "STOP_HERE", debugInfo);
@@ -2974,9 +2975,46 @@ void PixDiaTest::RunSizeAndOffsetTestCase(
     VERIFY_SUCCEEDED(field->GetFieldSizeInBits(&sizeInBits));
     VERIFY_ARE_EQUAL(memberSizes[i], sizeInBits);
   }
+  // Return the storage for the bitfield struct so that callers can check
+  // which of its fields are reported as residing in the same register.
+
+  CComPtr<IDxcPixDxilStorage> bfStorage;
+  VERIFY_SUCCEEDED(bf->GetStorage(&bfStorage));
+  return bfStorage;
+}
+
+void RunBitfieldAdjacencyTest(
+    IDxcPixDxilStorage *bfStorage,
+    std::vector<std::vector<wchar_t const *>> const &adjacentRuns) {
+  std::vector<std::set<DWORD>> registersByRun;
+  registersByRun.resize(adjacentRuns.size());
+  for (size_t run = 0; run < adjacentRuns.size(); ++run) {
+    for (auto const &field : adjacentRuns[run]) {
+      CComPtr<IDxcPixDxilStorage> fieldStorage;
+      VERIFY_SUCCEEDED(bfStorage->AccessField(field, &fieldStorage));
+      DWORD reg;
+      VERIFY_SUCCEEDED(fieldStorage->GetRegisterNumber(&reg));
+      registersByRun[run].insert(reg);
+    }
+  }
+  for (size_t run = 0; run < registersByRun.size(); ++run) {
+    {
+      // Every field in this run should have the same register number, so this
+      // set should be of size 1:
+      VERIFY_ARE_EQUAL(1, registersByRun[run].size());
+      // Every adjacent run should have different register numbers:
+      if (run != 0) {
+        VERIFY_ARE_NOT_EQUAL(*registersByRun[run - 1].begin(),
+                             *registersByRun[run].begin());
+      }
+    }
+  }
 }
 
 TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Simple) {
+  if (m_ver.SkipDxilVersion(1, 5))
+    return;
+
   const char *hlsl = R"(
 struct Bitfields
 {
@@ -3000,10 +3038,16 @@ void main()
 }
 
 )";
-  RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32});
+  auto bfStorage =
+      RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32});
+  RunBitfieldAdjacencyTest(bfStorage,
+                           {{L"first", L"second"}, {L"third"}, {L"fourth"}});
 }
 
 TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Derived) {
+  if (m_ver.SkipDxilVersion(1, 5))
+    return;
+
   const char *hlsl = R"(
 struct Bitfields
 {
@@ -3027,10 +3071,16 @@ void main()
 }
 
 )";
-  RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32});
+  auto bfStorage =
+      RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32});
+  RunBitfieldAdjacencyTest(bfStorage,
+                           {{L"first", L"second"}, {L"third"}, {L"fourth"}});
 }
 
 TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Bool) {
+  if (m_ver.SkipDxilVersion(1, 5))
+    return;
+
   const char *hlsl = R"(
 struct Bitfields
 {
@@ -3054,17 +3104,58 @@ void main()
 }
 
 )";
-  RunSizeAndOffsetTestCase(hlsl, {0, 1, 2, 32}, {1, 1, 3, 32});
+  auto bfStorage = RunSizeAndOffsetTestCase(hlsl, {0, 1, 2, 32}, {1, 1, 3, 32});
+  RunBitfieldAdjacencyTest(bfStorage,
+                           {{L"first", L"second", L"third"}, {L"fourth"}});
 }
 
 TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Overlap) {
+  if (m_ver.SkipDxilVersion(1, 5))
+    return;
+
+  const char *hlsl = R"(
+struct Bitfields
+{
+    uint32_t first : 20;
+    uint32_t second : 20; // should end up in second DWORD
+    uint32_t third : 3; // should shader second DWORD
+    uint32_t fourth; // should be in third DWORD
+};
+
+RWStructuredBuffer<int> UAV: register(u0);
+
+[numthreads(1, 1, 1)]
+void main()
+{
+  Bitfields bf;
+  bf.first = UAV[0];
+  bf.second = UAV[1];
+  bf.third = UAV[2];
+  bf.fourth = UAV[3];
+  UAV[16] = bf.first + bf.second + bf.third + bf.fourth; //STOP_HERE
+}
+
+)";
+  auto bfStorage =
+      RunSizeAndOffsetTestCase(hlsl, {0, 32, 52, 64}, {20, 20, 3, 32});
+  // (PIX #58022343): fields that overlap their storage type are not yet
+  // reflected properly in terms of their packed offsets as maintained via
+  // these PixDxc interfaces based on the dbg.declare data
+  // RunBitfieldAdjacencyTest(bfStorage,
+  //                         {{L"first"}, {L"second", L"third"}, {L"fourth"}});
+}
+
+TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_uint64) {
+  if (m_ver.SkipDxilVersion(1, 5))
+    return;
+
   const char *hlsl = R"(
 struct Bitfields
 {
-    unsigned int first : 20;
-    unsigned int second : 20; // should end up in second DWORD
-    unsigned int third : 3; // should shader second DWORD
-    unsigned int fourth; // should be in third DWORD
+    uint64_t first : 20;
+    uint64_t second : 20; // should end up in first uint64 also
+    uint64_t third : 24; // in first
+    uint64_t fourth; // should be in second
 };
 
 RWStructuredBuffer<int> UAV: register(u0);
@@ -3081,7 +3172,10 @@ void main()
 }
 
 )";
-  RunSizeAndOffsetTestCase(hlsl, {0, 32, 52, 64}, {20, 20, 3, 32});
+  auto bfStorage =
+      RunSizeAndOffsetTestCase(hlsl, {0, 20, 40, 64}, {20, 20, 24, 64});
+  RunBitfieldAdjacencyTest(bfStorage,
+                           {{L"first", L"second", L"third"}, {L"fourth"}});
 }
 
 TEST_F(PixDiaTest, DxcPixDxilDebugInfo_Alignment_ConstInt) {
@@ -3502,9 +3596,10 @@ void ClosestHitShader3(inout RayPayload payload, in BuiltInTriangleIntersectionA
 
   // Case: same function called from two places in same top-level function.
   // In this case, we expect the storage for the variable to be in the same
-  // place for both "instances" of the function: as a thread proceeds through
-  // the caller, it will write new values into the variable's storage during
-  // the second or subsequent invocations of the inlined function.
+  // place for both "instances" of the function: as a thread proceeds
+  // through the caller, it will write new values into the variable's
+  // storage during the second or subsequent invocations of the inlined
+  // function.
   DWORD instructionOffset =
       AdvanceUntilFunctionEntered(dxilDebugger, 0, L"ClosestHitShader3");
   instructionOffset = AdvanceUntilFunctionEntered(
@@ -3550,9 +3645,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_ForScopes) {
 
   // Case: same function called from two places in same top-level function.
   // In this case, we expect the storage for the variable to be in the same
-  // place for both "instances" of the function: as a thread proceeds through
-  // the caller, it will write new values into the variable's storage during
-  // the second or subsequent invocations of the inlined function.
+  // place for both "instances" of the function: as a thread proceeds
+  // through the caller, it will write new values into the variable's
+  // storage during the second or subsequent invocations of the inlined
+  // function.
   DWORD instructionOffset =
       AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain");
 
@@ -3597,9 +3693,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_ScopeBraces) {
 
   // Case: same function called from two places in same top-level function.
   // In this case, we expect the storage for the variable to be in the same
-  // place for both "instances" of the function: as a thread proceeds through
-  // the caller, it will write new values into the variable's storage during
-  // the second or subsequent invocations of the inlined function.
+  // place for both "instances" of the function: as a thread proceeds
+  // through the caller, it will write new values into the variable's
+  // storage during the second or subsequent invocations of the inlined
+  // function.
   DWORD instructionOffset =
       AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain");
 
@@ -3644,9 +3741,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_Function) {
 
   // Case: same function called from two places in same top-level function.
   // In this case, we expect the storage for the variable to be in the same
-  // place for both "instances" of the function: as a thread proceeds through
-  // the caller, it will write new values into the variable's storage during
-  // the second or subsequent invocations of the inlined function.
+  // place for both "instances" of the function: as a thread proceeds
+  // through the caller, it will write new values into the variable's
+  // storage during the second or subsequent invocations of the inlined
+  // function.
   DWORD instructionOffset =
       AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain");
 
@@ -3692,9 +3790,10 @@ void CSMain()
 
   // Case: same function called from two places in same top-level function.
   // In this case, we expect the storage for the variable to be in the same
-  // place for both "instances" of the function: as a thread proceeds through
-  // the caller, it will write new values into the variable's storage during
-  // the second or subsequent invocations of the inlined function.
+  // place for both "instances" of the function: as a thread proceeds
+  // through the caller, it will write new values into the variable's
+  // storage during the second or subsequent invocations of the inlined
+  // function.
   DWORD instructionOffset =
       AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain");
 

From 2da0a54f150f51bd6a2b85fd4cc76bdfd614219e Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 1 Jul 2025 17:12:36 -0700
Subject: [PATCH 82/93] Long Vector Execution Tests: Merge unary and binary op
 tests to main (#7549)

**Summary**
Adds infrastructure for long vector execution tests. This code and
additional test cases were already added to the staging-sm6.9 branch.
This is the second of several PRs to bring these changes into main. That
being said, reviews of this code should treat it as brand new. Resolves
#7545

**Includes:**
- A new test class `LongVector::OpTest` in `LongVectors.h/cpp`, still
part of the `ExecHLSLTests.dll` binary.
- HLSL source added to `ShaderOpArith.xml` to leverage the existing exec
test framework for shader compilation and execution.
- A new TAEF metadata file `LongVectorOpTable.xml` defining long vector
test cases.
- `LongVectorTestData.h` for statically defined input values, including
`HLSLHalf_t` and `HLSLBool_t`. This avoids duplicating values across
test cases.

**Template Handling**
To support template instantiation across translation units,
`LongVectors.tpp` contains full template definitions included by
`LongVectors.h`. These were originally required when tests lived in
`ExecutionTest.cpp`. Now that the tests are isolated, the definitions
will be moved back into `LongVectors.cpp`, but only after the long
vector tests from `staging-sm6.9` have been merged, which keeps the
manual merge simple.

**Utilities**
`HlslTestUtils.h` includes minor updates to support the new test
scenarios.
---
 include/dxc/Test/HlslTestUtils.h              |  95 +++-
 tools/clang/unittests/HLSLExec/CMakeLists.txt |   1 +
 .../clang/unittests/HLSLExec/ExecHLSLTests.rc |   3 +-
 .../unittests/HLSLExec/LongVectorOpTable.xml  | 515 ++++++++++++++++++
 .../unittests/HLSLExec/LongVectorTestData.h   |  74 +++
 .../clang/unittests/HLSLExec/LongVectors.cpp  | 316 +++++++++++
 tools/clang/unittests/HLSLExec/LongVectors.h  | 282 ++++++++++
 .../clang/unittests/HLSLExec/LongVectors.tpp  | 476 ++++++++++++++++
 .../unittests/HLSLExec/ShaderOpArith.xml      |  67 +++
 9 files changed, 1819 insertions(+), 10 deletions(-)
 create mode 100644 tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
 create mode 100644 tools/clang/unittests/HLSLExec/LongVectorTestData.h
 create mode 100644 tools/clang/unittests/HLSLExec/LongVectors.cpp
 create mode 100644 tools/clang/unittests/HLSLExec/LongVectors.h
 create mode 100644 tools/clang/unittests/HLSLExec/LongVectors.tpp

diff --git a/include/dxc/Test/HlslTestUtils.h b/include/dxc/Test/HlslTestUtils.h
index 44f3f6148a..dd89fda676 100644
--- a/include/dxc/Test/HlslTestUtils.h
+++ b/include/dxc/Test/HlslTestUtils.h
@@ -260,6 +260,29 @@ inline void LogErrorFmt(const wchar_t *fmt, ...) {
   WEX::Logging::Log::Error(buf.data());
 }
 
+inline void LogErrorFmtThrow(const char *fileName, int line, const wchar_t *fmt,
+                             ...) {
+  va_list args;
+  va_start(args, fmt);
+  std::wstring buf(vFormatToWString(fmt, args));
+  va_end(args);
+
+  std::wstringstream wss;
+  wss << L"Error in file: " << fileName << L" at line: " << line << L"\n"
+      << buf.data() << L"\n"
+      << buf;
+
+  WEX::Logging::Log::Error(wss.str().c_str());
+
+  // Throws an exception to abort the test.
+  VERIFY_FAIL(L"Test error");
+}
+
+// Macro to pass the file name and line number. Otherwise TAEF prints this file
+// and line number.
+#define LOG_ERROR_FMT_THROW(fmt, ...)                                          \
+  hlsl_test::LogErrorFmtThrow(__FILE__, __LINE__, fmt, __VA_ARGS__)
+
 inline std::wstring
 GetPathToHlslDataFile(const wchar_t *relative,
                       LPCWSTR paramName = HLSLDATAFILEPARAM,
@@ -461,15 +484,17 @@ inline bool GetTestParamUseWARP(bool defaultVal) {
 
 #ifdef FP_SUBNORMAL
 
-inline bool isdenorm(float f) { return FP_SUBNORMAL == std::fpclassify(f); }
+template <typename T> inline bool isdenorm(T f) {
+  return FP_SUBNORMAL == std::fpclassify(f);
+}
 
 #else
 
-inline bool isdenorm(float f) {
-  return (std::numeric_limits<float>::denorm_min() <= f &&
-          f < std::numeric_limits<float>::min()) ||
-         (-std::numeric_limits<float>::min() < f &&
-          f <= -std::numeric_limits<float>::denorm_min());
+template <typename T> inline bool isdenorm(T f) {
+  return (std::numeric_limits<T>::denorm_min() <= f &&
+          f < std::numeric_limits<T>::min()) ||
+         (-std::numeric_limits<T>::min() < f &&
+          f <= -std::numeric_limits<T>::denorm_min());
 }
 
 #endif // FP_SUBNORMAL
@@ -517,6 +542,44 @@ inline bool isnanFloat16(uint16_t val) {
 uint16_t ConvertFloat32ToFloat16(float val) throw();
 float ConvertFloat16ToFloat32(uint16_t val) throw();
 
+// True when Src is within ULPTolerance ULPs of Ref; NaN matches only NaN.
+inline bool CompareDoubleULP(
+    const double &Src, const double &Ref, int64_t ULPTolerance,
+    hlsl::DXIL::Float32DenormMode Mode = hlsl::DXIL::Float32DenormMode::Any) {
+  if (Src == Ref)
+    return true;
+  // Test both operands for NaN (as CompareHalfULP does) so that a NaN Ref can
+  // never pass the bit-distance check against +/-infinity.
+  if (std::isnan(Src) || std::isnan(Ref))
+    return std::isnan(Src) && std::isnan(Ref);
+
+  // With denorm mode Any, a denormal Ref may come back as a sign preserved
+  // zero. Otherwise output should pass the regular ulp testing.
+  if (Mode == hlsl::DXIL::Float32DenormMode::Any && isdenorm(Ref) &&
+      Src == 0 && std::signbit(Src) == std::signbit(Ref))
+    return true;
+
+  // Bit-pattern distance, taken in unsigned arithmetic so that it cannot
+  // overflow the way negating a signed difference can.
+  const uint64_t SrcBits = *((const uint64_t *)&Src);
+  const uint64_t RefBits = *((const uint64_t *)&Ref);
+  return (SrcBits > RefBits ? SrcBits - RefBits : RefBits - SrcBits) <=
+         (uint64_t)ULPTolerance;
+}
+
+// True when Src equals Ref, both are NaN, or they differ by less than Epsilon.
+inline bool CompareDoubleEpsilon(const double &Src, const double &Ref,
+                                 float Epsilon) {
+  if (Src == Ref)
+    return true;
+  if (std::isnan(Src))
+    return std::isnan(Ref);
+  // A NaN Ref with a non-NaN Src makes the difference NaN, which compares
+  // false below, so the mismatch is still reported. Infinities of the same
+  // sign were already accepted by the equality test above.
+  return fabs(Src - Ref) < Epsilon;
+}
+
 inline bool CompareFloatULP(
     const float &fsrc, const float &fref, int ULPTolerance,
     hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
@@ -568,12 +631,26 @@ inline bool CompareFloatRelativeEpsilon(
 
 inline bool CompareHalfULP(const uint16_t &fsrc, const uint16_t &fref,
                            float ULPTolerance) {
+  // Treat +0 and -0 as equal
+  if ((fsrc & ~FLOAT16_BIT_SIGN) == 0 && (fref & ~FLOAT16_BIT_SIGN) == 0)
+    return true;
   if (fsrc == fref)
     return true;
-  if (isnanFloat16(fsrc))
-    return isnanFloat16(fref);
+
+  const bool nanRef = isnanFloat16(fref);
+  const bool nanSrc = isnanFloat16(fsrc);
+  if (nanRef || nanSrc)
+    return nanRef && nanSrc;
+
+  // Map to monotonic ordering for correct ULP diff
+  auto toOrdered = [](uint16_t h) -> int {
+    return (h & FLOAT16_BIT_SIGN) ? (~h & 0xFFFF) : (h | 0x8000);
+  };
+
   // 16-bit floating point numbers must preserve denorms
-  int diff = fsrc - fref;
+  int i_fsrc = toOrdered(fsrc);
+  int i_fref = toOrdered(fref);
+  int diff = i_fsrc - i_fref;
   unsigned int uDiff = diff < 0 ? -diff : diff;
   return uDiff <= (unsigned int)ULPTolerance;
 }
diff --git a/tools/clang/unittests/HLSLExec/CMakeLists.txt b/tools/clang/unittests/HLSLExec/CMakeLists.txt
index df61aad854..b490ac94e9 100644
--- a/tools/clang/unittests/HLSLExec/CMakeLists.txt
+++ b/tools/clang/unittests/HLSLExec/CMakeLists.txt
@@ -9,6 +9,7 @@ add_clang_library(ExecHLSLTests SHARED
   ExecutionTest.cpp
   ShaderOpTest.cpp
   TableParameterHandler.cpp
+  LongVectors.cpp
   ExecHLSLTests.rc
   )
 
diff --git a/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc b/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc
index 6f4659910c..29459ee825 100644
--- a/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc
+++ b/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc
@@ -1,3 +1,4 @@
 #include <windows.h>
 
-ShaderOpArithTable.xml DATASOURCE_XML "ShaderOpArithTable.xml"
\ No newline at end of file
+ShaderOpArithTable.xml DATASOURCE_XML "ShaderOpArithTable.xml"
+LongVectorOpTable.xml DATASOURCE_XML "LongVectorOpTable.xml"
diff --git a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
new file mode 100644
index 0000000000..39a2fa481e
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
@@ -0,0 +1,515 @@
+<?xml version="1.0" ?>
+<Data>
+    <Table Id="BinaryOpTable">
+        <ParameterTypes>
+          <!-- InputValueSetName1 is optional. If no value is provided, the
+          default value set for the data type is used. This string is a key
+          into the std::map defined in LongVectorTestData.h for the
+          applicable DataType. -->
+          <ParameterType Name="InputValueSetName1">String</ParameterType>
+          <!-- InputValueSetName2 is optional. Same as InputValueSetName1 -->
+          <ParameterType Name="InputValueSetName2">String</ParameterType>
+          <ParameterType Name="DataType">String</ParameterType>
+          <ParameterType Name="OpTypeEnum">String</ParameterType>
+        </ParameterTypes>
+        <!-- LongVectorBinaryOpTypeTable DataType: int16 -->
+        <Row Name="ScalarAdd_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Add_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Subtract_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Multiply_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Divide_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Modulus_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarMin_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Min_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="ScalarMax_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <Row Name="Max_int16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">int16</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: int32 -->
+        <Row Name="ScalarAdd_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Add_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Subtract_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Multiply_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Divide_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Modulus_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarMin_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Min_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="ScalarMax_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <Row Name="Max_int32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">int32</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: int64 -->
+        <Row Name="ScalarAdd_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Add_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Subtract_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Multiply_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Divide_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Modulus_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarMin_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Min_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="ScalarMax_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <Row Name="Max_int64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">int64</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: uint16 -->
+        <Row Name="ScalarAdd_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Add_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Subtract_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Multiply_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Divide_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Modulus_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarMin_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Min_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="ScalarMax_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <Row Name="Max_uint16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">uint16</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: uint32 -->
+        <Row Name="ScalarAdd_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Add_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Subtract_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Multiply_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Divide_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Modulus_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarMin_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Min_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="ScalarMax_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <Row Name="Max_uint32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">uint32</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: uint64 -->
+        <Row Name="ScalarAdd_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Add_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Subtract_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Multiply_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Divide_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Modulus_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarMin_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Min_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="ScalarMax_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <Row Name="Max_uint64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">uint64</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: float32 -->
+        <Row Name="ScalarAdd_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Add_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Subtract_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Multiply_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Divide_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Modulus_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarMin_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Min_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="ScalarMax_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <Row Name="Max_float32">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">float32</Parameter>
+        </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: float64 -->
+        <Row Name="ScalarAdd_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Add_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Subtract_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Multiply_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Divide_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="ScalarMin_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Min_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="ScalarMax_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+        <Row Name="Max_float64">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">float64</Parameter>
+        </Row>
+    </Table>
+    <Table Id="UnaryOpTable">
+      <ParameterTypes>
+        <!-- InputValueSetName1 is optional. If no value is provided, the
+        default value set for the data type is used. This string is a key
+        into the std::map defined in LongVectorTestData.h for the applicable
+        DataType. -->
+        <ParameterType Name="InputValueSetName1">String</ParameterType>
+        <ParameterType Name="DataType">String</ParameterType>
+        <ParameterType Name="OpTypeEnum">String</ParameterType>
+      </ParameterTypes>
+      <!-- LongVectorUnaryOpTypeTable DataType: int16 -->
+      <Row Name="Initialize_int16">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">int16</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: int32 -->
+      <Row Name="Initialize_int32">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">int32</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: int64 -->
+      <Row Name="Initialize_int64">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">int64</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: uint16 -->
+      <Row Name="Initialize_uint16">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">uint16</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: uint32 -->
+      <Row Name="Initialize_uint32">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">uint32</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: uint64 -->
+      <Row Name="Initialize_uint64">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">uint64</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: float32 -->
+      <Row Name="Initialize_float32">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: float64 -->
+      <Row Name="Initialize_float64">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">float64</Parameter>
+      </Row>
+    </Table>
+</Data>
diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h
new file mode 100644
index 0000000000..002c765609
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h
@@ -0,0 +1,74 @@
+#ifndef LONGVECTORTESTDATA_H
+#define LONGVECTORTESTDATA_H
+
+#include <Verify.h>
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+template <typename T> struct LongVectorTestData {
+  static const std::map<std::wstring, std::vector<T>> Data;
+};
+
+template <> struct LongVectorTestData<int16_t> {
+  inline static const std::map<std::wstring, std::vector<int16_t>> Data = {
+      {L"DefaultInputValueSet1", {-6, 1, 7, 3, 8, 4, -3, 8, 8, -2}},
+      {L"DefaultInputValueSet2", {5, -6, -3, -2, 9, 3, 1, -3, -7, 2}},
+  };
+};
+
+template <> struct LongVectorTestData<int32_t> {
+  inline static const std::map<std::wstring, std::vector<int32_t>> Data = {
+      {L"DefaultInputValueSet1", {-6, 1, 7, 3, 8, 4, -3, 8, 8, -2}},
+      {L"DefaultInputValueSet2", {5, -6, -3, -2, 9, 3, 1, -3, -7, 2}},
+  };
+};
+
+template <> struct LongVectorTestData<int64_t> {
+  inline static const std::map<std::wstring, std::vector<int64_t>> Data = {
+      {L"DefaultInputValueSet1", {-6, 11, 7, 3, 8, 4, -3, 8, 8, -2}},
+      {L"DefaultInputValueSet2", {5, -1337, -3, -2, 9, 3, 1, -3, 501, 2}},
+  };
+};
+
+template <> struct LongVectorTestData<uint16_t> {
+  inline static const std::map<std::wstring, std::vector<uint16_t>> Data = {
+      {L"DefaultInputValueSet1", {1, 699, 3, 1023, 5, 6, 0, 8, 9, 10}},
+      {L"DefaultInputValueSet2", {2, 111, 3, 4, 5, 9, 21, 8, 9, 10}},
+  };
+};
+
+template <> struct LongVectorTestData<uint32_t> {
+  inline static const std::map<std::wstring, std::vector<uint32_t>> Data = {
+      {L"DefaultInputValueSet1", {1, 2, 3, 4, 5, 0, 7, 8, 9, 10}},
+      {L"DefaultInputValueSet2", {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}},
+  };
+};
+
+template <> struct LongVectorTestData<uint64_t> {
+  inline static const std::map<std::wstring, std::vector<uint64_t>> Data = {
+      {L"DefaultInputValueSet1", {1, 2, 3, 4, 5, 0, 7, 1000, 9, 10}},
+      {L"DefaultInputValueSet2", {1, 2, 1337, 4, 5, 6, 7, 8, 9, 10}},
+  };
+};
+
+template <> struct LongVectorTestData<float> {
+  inline static const std::map<std::wstring, std::vector<float>> Data = {
+      {L"DefaultInputValueSet1",
+       {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+      {L"DefaultInputValueSet2",
+       {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+  };
+};
+
+template <> struct LongVectorTestData<double> {
+  inline static const std::map<std::wstring, std::vector<double>> Data = {
+      {L"DefaultInputValueSet1",
+       {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+      {L"DefaultInputValueSet2",
+       {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+  };
+};
+
+#endif // LONGVECTORTESTDATA_H
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp
new file mode 100644
index 0000000000..54e5224798
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp
@@ -0,0 +1,316 @@
+#include "LongVectors.h"
+#include "HlslExecTestUtils.h"
+#include <iomanip>
+
+LongVector::BinaryOpType
+LongVector::getBinaryOpType(const std::wstring &OpTypeString) {
+  return getLongVectorOpType<LongVector::BinaryOpType>(
+      binaryOpTypeStringToEnumMap, OpTypeString,
+      std::size(binaryOpTypeStringToEnumMap));
+}
+
+LongVector::UnaryOpType
+LongVector::getUnaryOpType(const std::wstring &OpTypeString) {
+  return getLongVectorOpType<LongVector::UnaryOpType>(
+      unaryOpTypeStringToEnumMap, OpTypeString,
+      std::size(unaryOpTypeStringToEnumMap));
+}
+
+// These are helper arrays to be used with the TableParameterHandler that parses
+// the LongVectorOpTable.xml file for us.
+static TableParameter BinaryOpParameters[] = {
+    {L"DataType", TableParameter::STRING, true},
+    {L"OpTypeEnum", TableParameter::STRING, true},
+    {L"InputValueSetName1", TableParameter::STRING, false},
+    {L"InputValueSetName2", TableParameter::STRING, false},
+};
+
+static TableParameter UnaryOpParameters[] = {
+    {L"DataType", TableParameter::STRING, true},
+    {L"OpTypeEnum", TableParameter::STRING, true},
+    {L"InputValueSetName1", TableParameter::STRING, false},
+};
+
+// Class setup: on first use, loads d3d12.dll and tries to enable the Agility
+// SDK, experimental mode and debug layer. Returns false only if the load fails.
+bool LongVector::OpTest::classSetup() {
+  // Run this only once.
+  if (!Initialized) {
+    HMODULE Runtime = LoadLibraryW(L"d3d12.dll");
+    if (Runtime == nullptr)
+      return false;
+    // Latch only after the load succeeds so a failure is not cached as done.
+    Initialized = true;
+    // Do not: FreeLibrary(Runtime);
+    // If we actually free the library, it defeats the purpose of
+    // enableAgilitySDK and enableExperimentalMode.
+
+    HRESULT HR = enableAgilitySDK(Runtime);
+    if (FAILED(HR))
+      hlsl_test::LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", HR);
+    else if (HR == S_FALSE)
+      hlsl_test::LogCommentFmt(L"Agility SDK not enabled.");
+    else
+      hlsl_test::LogCommentFmt(L"Agility SDK enabled.");
+
+    HR = enableExperimentalMode(Runtime);
+    if (FAILED(HR))
+      hlsl_test::LogCommentFmt(
+          L"Unable to enable shader experimental mode - 0x%08x.", HR);
+    else if (HR == S_FALSE)
+      hlsl_test::LogCommentFmt(L"Experimental mode not enabled.");
+    else
+      hlsl_test::LogCommentFmt(L"Experimental mode enabled.");
+
+    HR = enableDebugLayer();
+    if (FAILED(HR))
+      hlsl_test::LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", HR);
+    else if (HR == S_FALSE)
+      hlsl_test::LogCommentFmt(L"Debug layer not enabled.");
+    else
+      hlsl_test::LogCommentFmt(L"Debug layer enabled.");
+  }
+
+  return true;
+}
+
+TEST_F(LongVector::OpTest, binaryOpTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  using namespace WEX::Common;
+
+  const int TableSize = sizeof(BinaryOpParameters) / sizeof(TableParameter);
+  TableParameterHandler Handler(BinaryOpParameters, TableSize);
+
+  std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str);
+  std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str);
+
+  auto OpType = LongVector::getBinaryOpType(OpTypeString);
+  dispatchTestByDataType(OpType, DataType, Handler);
+}
+
+TEST_F(LongVector::OpTest, unaryOpTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  const int TableSize = sizeof(UnaryOpParameters) / sizeof(TableParameter);
+  TableParameterHandler Handler(UnaryOpParameters, TableSize);
+
+  std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str);
+  std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str);
+
+  auto OpType = LongVector::getUnaryOpType(OpTypeString);
+  dispatchTestByDataType(OpType, DataType, Handler);
+}
+
+template <typename LongVectorOpTypeT>
+void LongVector::OpTest::dispatchTestByDataType(
+    LongVectorOpTypeT OpType, std::wstring DataType,
+    TableParameterHandler &Handler) {
+  using namespace WEX::Common;
+
+  if (DataType == L"int16")
+    dispatchTestByVectorSize<int16_t>(OpType, Handler);
+  else if (DataType == L"int32")
+    dispatchTestByVectorSize<int32_t>(OpType, Handler);
+  else if (DataType == L"int64")
+    dispatchTestByVectorSize<int64_t>(OpType, Handler);
+  else if (DataType == L"uint16")
+    dispatchTestByVectorSize<uint16_t>(OpType, Handler);
+  else if (DataType == L"uint32")
+    dispatchTestByVectorSize<uint32_t>(OpType, Handler);
+  else if (DataType == L"uint64")
+    dispatchTestByVectorSize<uint64_t>(OpType, Handler);
+  else if (DataType == L"float32")
+    dispatchTestByVectorSize<float>(OpType, Handler);
+  else if (DataType == L"float64")
+    dispatchTestByVectorSize<double>(OpType, Handler);
+  else
+    VERIFY_FAIL(
+        String().Format(L"DataType: %s is not recognized.", DataType.c_str()));
+}
+
+// Builds the TestConfig for OpType, applies any input value set overrides from
+// the table row, and runs the test once for each vector length under test.
+template <typename DataTypeT, typename LongVectorOpTypeT>
+void LongVector::OpTest::dispatchTestByVectorSize(
+    LongVectorOpTypeT OpType, TableParameterHandler &Handler) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> Config(OpType);
+
+  // Both value set names are optional table parameters. An empty string means
+  // the default value set for this DataType stays in effect.
+  const std::wstring ValueSetName1(
+      Handler.GetTableParamByName(L"InputValueSetName1")->m_str);
+  if (!ValueSetName1.empty())
+    Config.setInputValueSet1(ValueSetName1);
+
+  // The second value set is only looked up for binary ops.
+  if (Config.isBinaryOp()) {
+    const std::wstring ValueSetName2(
+        Handler.GetTableParamByName(L"InputValueSetName2")->m_str);
+    if (!ValueSetName2.empty())
+      Config.setInputValueSet2(ValueSetName2);
+  }
+
+  const std::vector<size_t> VectorSizes = {3, 4, 5, 16, 17, 35, 100, 256, 1024};
+  for (const size_t VectorSize : VectorSizes)
+    testBaseMethod<DataTypeT, LongVectorOpTypeT>(Config, VectorSize);
+}
+
+// Runs one long vector op test for TestConfig at the given vector length:
+// builds the inputs, runs the "LongVectorOp" shader and verifies its output.
+template <typename DataTypeT, typename LongVectorOpTypeT>
+void LongVector::OpTest::testBaseMethod(
+    LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> &TestConfig,
+    size_t VectorSizeToTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  hlsl_test::LogCommentFmt(L"Running LongVectorOpTestBase<%S, %zu>",
+                           typeid(DataTypeT).name(), VectorSizeToTest);
+
+  bool LogInputs = false;
+  WEX::TestExecution::RuntimeParameters::TryGetValue(L"LongVectorLogInputs",
+                                                     LogInputs);
+
+  CComPtr<ID3D12Device> D3DDevice;
+  if (!createDevice(&D3DDevice, ExecTestUtils::D3D_SHADER_MODEL_6_9, false)) {
+#ifdef _HLK_CONF
+    LOG_ERROR_FMT_THROW(
+        L"Device does not support SM 6.9. Can't run these tests.");
+#else
+    WEX::Logging::Log::Comment(
+        "Device does not support SM 6.9. Can't run these tests.");
+    WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+    return;
+#endif
+  }
+
+  std::vector<DataTypeT> InputVector1;
+  InputVector1.reserve(VectorSizeToTest);
+  std::vector<DataTypeT> InputVector2; // May be unused, but must be defined.
+  InputVector2.reserve(VectorSizeToTest);
+  std::vector<DataTypeT> ScalarInput; // May be unused, but must be defined.
+  const bool IsVectorBinaryOp =
+      TestConfig.isBinaryOp() && !TestConfig.isScalarOp();
+
+  std::vector<DataTypeT> InputVector1ValueSet = TestConfig.getInputValueSet1();
+  std::vector<DataTypeT> InputVector2ValueSet =
+      TestConfig.isBinaryOp() ? TestConfig.getInputValueSet2()
+                              : std::vector<DataTypeT>();
+
+  if (TestConfig.isScalarOp())
+    // Scalar ops are always binary ops. So InputVector2ValueSet is initialized
+    // with values above.
+    ScalarInput.push_back(InputVector2ValueSet[0]);
+
+  // Fill the input vectors with values from the value set. Repeat the values
+  // when we reach the end of the value set.
+  for (size_t Index = 0; Index < VectorSizeToTest; Index++) {
+    InputVector1.push_back(
+        InputVector1ValueSet[Index % InputVector1ValueSet.size()]);
+
+    if (IsVectorBinaryOp)
+      InputVector2.push_back(
+          InputVector2ValueSet[Index % InputVector2ValueSet.size()]);
+  }
+
+  std::vector<DataTypeT> ExpectedVector;
+  if (IsVectorBinaryOp)
+    ExpectedVector =
+        computeExpectedValues(InputVector1, InputVector2, TestConfig);
+  else if (TestConfig.isScalarOp())
+    ExpectedVector =
+        computeExpectedValues(InputVector1, ScalarInput[0], TestConfig);
+  else // Must be a unary op
+    ExpectedVector = computeExpectedValues(InputVector1, TestConfig);
+
+  if (LogInputs) {
+    logLongVector<DataTypeT>(InputVector1, L"InputVector1");
+
+    if (IsVectorBinaryOp)
+      logLongVector<DataTypeT>(InputVector2, L"InputVector2");
+    else if (TestConfig.isScalarOp())
+      logLongVector<DataTypeT>(ScalarInput, L"ScalarInput");
+  }
+
+  // We have to construct the string outside of the lambda. Otherwise it's
+  // cleaned up when the lambda finishes executing but before the shader runs.
+  std::string CompilerOptionsString =
+      TestConfig.getCompilerOptionsString(VectorSizeToTest);
+
+  // The name of the shader we want to use in ShaderOpArith.xml. Could also add
+  // logic to set this name in ShaderOpArithTable.xml so we can use different
+  // shaders for different tests.
+  LPCSTR ShaderName = "LongVectorOp";
+  // ShaderOpArith.xml defines the input/output resources and the shader source.
+  CComPtr<IStream> TestXML;
+  readHlslDataIntoNewStream(L"ShaderOpArith.xml", &TestXML, DxcDllSupport);
+
+  // RunShaderOpTest is a helper function that handles resource creation
+  // and setup. It also handles the shader compilation and execution. It takes a
+  // callback that is called when the shader is compiled, but before it is
+  // executed.
+  std::shared_ptr<st::ShaderOpTestResult> TestResult = st::RunShaderOpTest(
+      D3DDevice, DxcDllSupport, TestXML, ShaderName,
+      [&](LPCSTR Name, std::vector<BYTE> &ShaderData, st::ShaderOp *ShaderOp) {
+        hlsl_test::LogCommentFmt(L"RunShaderOpTest CallBack. Resource Name: %S",
+                                 Name);
+
+        // This callback is called once for each resource defined for
+        // "LongVectorOp" in ShaderOpArith.xml. All callbacks are fired for each
+        // resource. We determine whether they are applicable to the test case
+        // when they run.
+
+        // Process the callback for the OutputVector resource.
+        if (0 == _stricmp(Name, "OutputVector")) {
+          // We only need to set the compiler options string once. So this is a
+          // convenient place to do it.
+          ShaderOp->Shaders.at(0).Arguments = CompilerOptionsString.c_str();
+          return;
+        }
+
+        // Process the callback for the InputFuncArgs resource.
+        if (0 == _stricmp(Name, "InputFuncArgs")) {
+          if (TestConfig.isScalarOp())
+            fillShaderBufferFromLongVectorData<DataTypeT>(ShaderData,
+                                                          ScalarInput);
+          return;
+        }
+
+        // Process the callback for the InputVector1 resource.
+        if (0 == _stricmp(Name, "InputVector1")) {
+          fillShaderBufferFromLongVectorData<DataTypeT>(ShaderData,
+                                                        InputVector1);
+          return;
+        }
+
+        // Process the callback for the InputVector2 resource.
+        if (0 == _stricmp(Name, "InputVector2")) {
+          if (IsVectorBinaryOp)
+            fillShaderBufferFromLongVectorData<DataTypeT>(ShaderData,
+                                                          InputVector2);
+          return;
+        }
+
+        LOG_ERROR_FMT_THROW(
+            L"RunShaderOpTest CallBack. Unexpected Resource Name: %S", Name);
+      });
+
+  // Map the data from GPU to CPU memory so we can verify our expectations.
+  MappedData ShaderOutData;
+  TestResult->Test->GetReadBackData("OutputVector", &ShaderOutData);
+
+  std::vector<DataTypeT> OutputVector;
+  fillLongVectorDataFromShaderBuffer<DataTypeT>(ShaderOutData, OutputVector,
+                                                VectorSizeToTest);
+
+  // doVectorsMatch returns bool, which VERIFY_SUCCEEDED would always accept.
+  VERIFY_IS_TRUE(doVectorsMatch<DataTypeT>(OutputVector, ExpectedVector,
+                                           TestConfig.getTolerance(),
+                                           TestConfig.getValidationType()));
+}
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.h b/tools/clang/unittests/HLSLExec/LongVectors.h
new file mode 100644
index 0000000000..392d059bcd
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/LongVectors.h
@@ -0,0 +1,282 @@
+#ifndef LONGVECTORS_H
+#define LONGVECTORS_H
+
+#include <array>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <string>
+
+#include <DirectXMath.h>
+#include <DirectXPackedVector.h>
+
+#include <Verify.h>
+
+#include "LongVectorTestData.h"
+#include "ShaderOpTest.h"
+#include "TableParameterHandler.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/dxcapi.use.h"
+#include "dxc/Test/HlslTestUtils.h"
+
+namespace LongVector {
+template <typename DataTypeT, typename LongVectorOpTypeT>
+class TestConfig; // Forward declaration
+
+class OpTest {
+public:
+  BEGIN_TEST_CLASS(OpTest)
+  END_TEST_CLASS()
+
+  TEST_CLASS_SETUP(classSetup);
+
+  BEGIN_TEST_METHOD(binaryOpTest)
+  TEST_METHOD_PROPERTY(L"DataSource",
+                       L"Table:LongVectorOpTable.xml#BinaryOpTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(unaryOpTest)
+  TEST_METHOD_PROPERTY(L"DataSource",
+                       L"Table:LongVectorOpTable.xml#UnaryOpTable")
+  END_TEST_METHOD()
+
+  template <typename LongVectorOpTypeT>
+  void dispatchTestByDataType(LongVectorOpTypeT OpType, std::wstring DataType,
+                              TableParameterHandler &Handler);
+
+  template <typename DataTypeT, typename LongVectorOpTypeT>
+  void dispatchTestByVectorSize(LongVectorOpTypeT OpType,
+                                TableParameterHandler &Handler);
+
+  template <typename DataTypeT, typename LongVectorOpTypeT>
+  void testBaseMethod(
+      LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> &TestConfig,
+      size_t VectorSizeToTest);
+
+private:
+  dxc::DxcDllSupport DxcDllSupport;
+  bool Initialized = false;
+};
+
+template <typename DataTypeT>
+void fillShaderBufferFromLongVectorData(std::vector<BYTE> &ShaderBuffer,
+                                        std::vector<DataTypeT> &TestData);
+
+template <typename DataTypeT>
+void fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer,
+                                        std::vector<DataTypeT> &TestData,
+                                        size_t NumElements);
+
+template <typename DataTypeT> constexpr bool isFloatingPointType() {
+  return std::is_same_v<DataTypeT, float> || std::is_same_v<DataTypeT, double>;
+}
+
+struct LongVectorOpTypeStringToEnumValue {
+  std::wstring OpTypeString;
+  uint32_t OpTypeValue;
+};
+
+template <typename DataTypeT>
+DataTypeT getLongVectorOpType(const LongVectorOpTypeStringToEnumValue *Values,
+                              const std::wstring &OpTypeString,
+                              std::size_t Length);
+
+enum ValidationType {
+  ValidationType_Epsilon,
+  ValidationType_Ulp,
+};
+
+enum BasicOpType {
+  BasicOpType_Binary,
+  BasicOpType_Unary,
+  BasicOpType_ScalarBinary,
+  BasicOpType_EnumValueCount
+};
+
+enum BinaryOpType {
+  BinaryOpType_ScalarAdd,
+  BinaryOpType_ScalarMultiply,
+  BinaryOpType_ScalarSubtract,
+  BinaryOpType_ScalarDivide,
+  BinaryOpType_ScalarModulus,
+  BinaryOpType_Multiply,
+  BinaryOpType_Add,
+  BinaryOpType_Subtract,
+  BinaryOpType_Divide,
+  BinaryOpType_Modulus,
+  BinaryOpType_Min,
+  BinaryOpType_Max,
+  BinaryOpType_ScalarMin,
+  BinaryOpType_ScalarMax,
+  BinaryOpType_EnumValueCount
+};
+
+static const LongVectorOpTypeStringToEnumValue binaryOpTypeStringToEnumMap[] = {
+    {L"BinaryOpType_ScalarAdd", BinaryOpType_ScalarAdd},
+    {L"BinaryOpType_ScalarMultiply", BinaryOpType_ScalarMultiply},
+    {L"BinaryOpType_ScalarSubtract", BinaryOpType_ScalarSubtract},
+    {L"BinaryOpType_ScalarDivide", BinaryOpType_ScalarDivide},
+    {L"BinaryOpType_ScalarModulus", BinaryOpType_ScalarModulus},
+    {L"BinaryOpType_Add", BinaryOpType_Add},
+    {L"BinaryOpType_Multiply", BinaryOpType_Multiply},
+    {L"BinaryOpType_Subtract", BinaryOpType_Subtract},
+    {L"BinaryOpType_Divide", BinaryOpType_Divide},
+    {L"BinaryOpType_Modulus", BinaryOpType_Modulus},
+    {L"BinaryOpType_Min", BinaryOpType_Min},
+    {L"BinaryOpType_Max", BinaryOpType_Max},
+    {L"BinaryOpType_ScalarMin", BinaryOpType_ScalarMin},
+    {L"BinaryOpType_ScalarMax", BinaryOpType_ScalarMax},
+};
+
+static_assert(_countof(binaryOpTypeStringToEnumMap) ==
+                  BinaryOpType_EnumValueCount,
+              "binaryOpTypeStringToEnumMap size mismatch. Did you "
+              "add a new enum value?");
+
+BinaryOpType getBinaryOpType(const std::wstring &OpTypeString);
+
+enum UnaryOpType { UnaryOpType_Initialize, UnaryOpType_EnumValueCount };
+
+static const LongVectorOpTypeStringToEnumValue unaryOpTypeStringToEnumMap[] = {
+    {L"UnaryOpType_Initialize", UnaryOpType_Initialize},
+};
+
+static_assert(_countof(unaryOpTypeStringToEnumMap) ==
+                  UnaryOpType_EnumValueCount,
+              "unaryOpTypeStringToEnumMap size mismatch. Did you add "
+              "a new enum value?");
+
+UnaryOpType getUnaryOpType(const std::wstring &OpTypeString);
+
+template <typename DataTypeT>
+std::vector<DataTypeT> getInputValueSetByKey(const std::wstring &Key,
+                                             bool LogKey = true) {
+  if (LogKey)
+    WEX::Logging::Log::Comment(
+        WEX::Common::String().Format(L"Using Value Set Key: %s", Key.c_str()));
+  return std::vector<DataTypeT>(LongVectorTestData<DataTypeT>::Data.at(Key));
+}
+
+template <typename DataTypeT>
+DataTypeT mod(const DataTypeT &A, const DataTypeT &B);
+
+template <typename LongVectorOpTypeT> struct TestConfigTraits {
+  TestConfigTraits(LongVectorOpTypeT OpType) : OpType(OpType) {}
+  // LongVectorOpTypeT* Enum values. We don't use a UINT because
+  // we want the type data.
+  LongVectorOpTypeT OpType;
+};
+
+template <typename DataTypeT>
+bool doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance, ValidationType);
+bool doValuesMatch(float A, float B, float Tolerance,
+                   ValidationType ValidationType);
+bool doValuesMatch(double A, double B, float Tolerance,
+                   ValidationType ValidationType);
+
+template <typename DataTypeT>
+bool doVectorsMatch(const std::vector<DataTypeT> &ActualValues,
+                    const std::vector<DataTypeT> &ExpectedValues,
+                    float Tolerance, ValidationType ValidationType);
+// Binary ops
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT>
+computeExpectedValues(const std::vector<DataTypeT> &InputVector1,
+                      const std::vector<DataTypeT> &InputVector2,
+                      const TestConfig<DataTypeT, LongVectorOpTypeT> &Config);
+
+// Binary scalar ops
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT>
+computeExpectedValues(const std::vector<DataTypeT> &InputVector1,
+                      const DataTypeT &ScalarInput,
+                      const TestConfig<DataTypeT, LongVectorOpTypeT> &Config);
+
+// Unary ops
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT>
+computeExpectedValues(const std::vector<DataTypeT> &InputVector1,
+                      const TestConfig<DataTypeT, LongVectorOpTypeT> &Config);
+
+template <typename DataTypeT>
+void logLongVector(const std::vector<DataTypeT> &Values,
+                   const std::wstring &Name);
+
+// Describes a single long vector op test. Passed to OpTest::testBaseMethod.
+template <typename DataTypeT, typename LongVectorOpTypeT> class TestConfig {
+public:
+  TestConfig() = default;
+
+  TestConfig(UnaryOpType OpType);
+  TestConfig(BinaryOpType OpType);
+
+  bool isBinaryOp() const {
+    return BasicOpType == LongVector::BasicOpType_Binary ||
+           BasicOpType == LongVector::BasicOpType_ScalarBinary;
+  }
+
+  bool isUnaryOp() const {
+    return BasicOpType == LongVector::BasicOpType_Unary;
+  }
+
+  bool isScalarOp() const {
+    return BasicOpType == LongVector::BasicOpType_ScalarBinary;
+  }
+
+  bool hasFunctionDefinition() const;
+  std::string getOPERAND2String() const;
+
+  // A helper to get the hlsl type as a string for a given C++ type.
+  // Used in the long vector tests.
+  std::string getHLSLTypeString() const;
+
+  DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B,
+                                 BinaryOpType OpType) const;
+  DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B) const;
+  DataTypeT computeExpectedValue(const DataTypeT &A, UnaryOpType OpType) const;
+  DataTypeT computeExpectedValue(const DataTypeT &A) const;
+
+  void setInputValueSet1(const std::wstring &InputValueSetName) {
+    this->InputValueSetName1 = InputValueSetName;
+  }
+
+  void setInputValueSet2(const std::wstring &InputValueSetName) {
+    this->InputValueSetName2 = InputValueSetName;
+  }
+
+  std::vector<DataTypeT> getInputValueSet1() const {
+    return getInputValueSet(1);
+  }
+
+  std::vector<DataTypeT> getInputValueSet2() const {
+    return getInputValueSet(2);
+  }
+
+  float getTolerance() const { return Tolerance; }
+  LongVector::ValidationType getValidationType() const {
+    return ValidationType;
+  }
+
+  std::string getCompilerOptionsString(size_t VectorSize) const;
+
+private:
+  std::vector<DataTypeT> getInputValueSet(size_t ValueSetIndex) const;
+
+  // To be used for the value of -DOPERATOR
+  std::string OperatorString;
+  // To be used for the value of -DFUNC
+  std::string IntrinsicString;
+  LongVector::BasicOpType BasicOpType = LongVector::BasicOpType_EnumValueCount;
+  float Tolerance = 0.0;
+  LongVector::ValidationType ValidationType =
+      LongVector::ValidationType::ValidationType_Epsilon;
+  LongVector::TestConfigTraits<LongVectorOpTypeT> OpTypeTraits;
+  std::wstring InputValueSetName1 = L"DefaultInputValueSet1";
+  std::wstring InputValueSetName2 = L"DefaultInputValueSet2";
+}; // class LongVector::TestConfig
+
+}; // namespace LongVector
+
+#include "LongVectors.tpp"
+
+#endif // LONGVECTORS_H
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.tpp b/tools/clang/unittests/HLSLExec/LongVectors.tpp
new file mode 100644
index 0000000000..de333cf863
--- /dev/null
+++ b/tools/clang/unittests/HLSLExec/LongVectors.tpp
@@ -0,0 +1,476 @@
+template <typename DataTypeT>
+DataTypeT LongVector::getLongVectorOpType(const LongVectorOpTypeStringToEnumValue *Values,
+                             const std::wstring &OpTypeString,
+                             std::size_t Length) {
+  for (size_t i = 0; i < Length; i++) {
+    if (Values[i].OpTypeString == OpTypeString)
+      return static_cast<DataTypeT>(Values[i].OpTypeValue);
+  }
+
+  LOG_ERROR_FMT_THROW(L"Invalid LongVectorOpType string: %s",
+                      OpTypeString.c_str());
+
+  return static_cast<DataTypeT>(UINT_MAX);
+}
+
+// Helper to fill the shader buffer based on type. Convenient to be used when
+// copying HLSL*_t types so we can copy the underlying type directly instead of
+// the struct.
+template <typename DataTypeT>
+void LongVector::fillShaderBufferFromLongVectorData(std::vector<BYTE> &ShaderBuffer, std::vector<DataTypeT> &TestData) {
+
+  const size_t NumElements = TestData.size();
+  const size_t DataSize = sizeof(DataTypeT) * NumElements;
+  ShaderBuffer.resize(DataSize);
+
+  DataTypeT *ShaderBufferPtr =
+    reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
+  for (size_t i = 0; i < NumElements; ++i)
+    ShaderBufferPtr[i] = TestData[i];
+}
+
+// Remainder helpers: operator% for the generic case, std::fmod for float types.
+template <typename DataTypeT>
+DataTypeT LongVector::mod(const DataTypeT &A, const DataTypeT &B) {
+  return A % B;
+}
+// Full specializations are ordinary functions, so in a header they need inline.
+template <> inline float LongVector::mod(const float &A, const float &B) {
+  return std::fmod(A, B);
+}
+
+template <> inline double LongVector::mod(const double &A, const double &B) {
+  return std::fmod(A, B);
+}
+
+// Helper to fill the test data from the shader buffer based on type. Convenient
+// to be used when copying HLSL*_t types so we can use the underlying type.
+template <typename DataTypeT>
+void LongVector::fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer,
+                                        std::vector<DataTypeT> &TestData,
+                                        size_t NumElements) {
+  DataTypeT *ShaderBufferPtr =
+    reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
+  for (size_t i = 0; i < NumElements; ++i)
+    TestData.push_back(ShaderBufferPtr[i]);
+}
+
+template <typename DataTypeT>
+bool LongVector::doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance,
+                   LongVector::ValidationType) {
+  if (Tolerance == 0.0f)
+    return A == B;
+
+  DataTypeT Diff = A > B ? A - B : B - A;
+  return Diff <= Tolerance;
+}
+
+// Compares two floats via CompareFloatEpsilon or CompareFloatULP, as chosen by
+// ValidationType. Inline because this definition is included from a header.
+inline bool LongVector::doValuesMatch(float A, float B, float Tolerance,
+                          LongVector::ValidationType ValidationType) {
+  switch (ValidationType) {
+  case LongVector::ValidationType_Epsilon:
+    return CompareFloatEpsilon(A, B, Tolerance);
+  case LongVector::ValidationType_Ulp:
+    // Tolerance is in ULPs. Convert to int for the comparison.
+    return CompareFloatULP(A, B, static_cast<int>(Tolerance));
+  default:
+    WEX::Logging::Log::Error(
+        L"Invalid ValidationType. Expecting Epsilon or ULP.");
+    return false;
+  }
+}
+
+// Compares two doubles via CompareDoubleEpsilon or CompareDoubleULP, as chosen
+// by ValidationType. Inline because this definition is included from a header.
+inline bool LongVector::doValuesMatch(double A, double B, float Tolerance,
+                          LongVector::ValidationType ValidationType) {
+  switch (ValidationType) {
+  case LongVector::ValidationType_Epsilon:
+    return CompareDoubleEpsilon(A, B, Tolerance);
+  case LongVector::ValidationType_Ulp:
+    // Tolerance is in ULPs. Convert to int64_t for the comparison.
+    return CompareDoubleULP(A, B, static_cast<int64_t>(Tolerance));
+  default:
+    WEX::Logging::Log::Error(
+        L"Invalid ValidationType. Expecting Epsilon or ULP.");
+    return false;
+  }
+}
+
+
+// Compares ActualValues to ExpectedValues element by element and logs every
+// mismatch. Returns true only if the sizes agree and all elements match.
+template <typename DataTypeT>
+bool LongVector::doVectorsMatch(const std::vector<DataTypeT> &ActualValues,
+                   const std::vector<DataTypeT> &ExpectedValues,
+                   float Tolerance,
+                   LongVector::ValidationType ValidationType) {
+  VERIFY_IS_TRUE(ActualValues.size() == ExpectedValues.size(),
+                 L"doVectorsMatch() called with mismatched vector sizes.");
+  // Never index past the shorter vector if the verify above did not abort.
+  if (ActualValues.size() != ExpectedValues.size())
+    return false;
+
+  // Stash mismatched indexes for easy failure logging later
+  std::vector<size_t> MismatchedIndexes;
+  for (size_t i = 0; i < ActualValues.size(); ++i) {
+    if (!doValuesMatch(ActualValues[i], ExpectedValues[i], Tolerance,
+                       ValidationType))
+      MismatchedIndexes.push_back(i);
+  }
+
+  for (size_t Index : MismatchedIndexes) {
+    std::wstringstream Wss(L"");
+    Wss << std::setprecision(15); // Set precision for floating point types
+    Wss << L"Mismatch at Index: " << Index;
+    Wss << L" Actual Value:" << ActualValues[Index] << ",";
+    Wss << L" Expected Value:" << ExpectedValues[Index];
+    WEX::Logging::Log::Error(Wss.str().c_str());
+  }
+  return MismatchedIndexes.empty();
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT> LongVector::computeExpectedValues(
+  const std::vector<DataTypeT> &InputVector1,
+  const std::vector<DataTypeT> &InputVector2,
+  const LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> &Config) {
+
+  VERIFY_IS_TRUE(
+      Config.isBinaryOp(),
+      L"computeExpectedValues() called with a non-binary op config.");
+
+  std::vector<DataTypeT> ExpectedValues = {};
+
+  for (size_t i = 0; i < InputVector1.size(); ++i)
+    ExpectedValues.push_back(
+      Config.computeExpectedValue(InputVector1[i], InputVector2[i]));
+
+  return ExpectedValues;
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT> LongVector::computeExpectedValues(
+  const std::vector<DataTypeT> &InputVector1, const DataTypeT &ScalarInput,
+  const LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> &Config) {
+
+  VERIFY_IS_TRUE(Config.isScalarOp(), L"computeExpectedValues() called with a "
+                                      L"non-binary non-scalar op config.");
+
+  std::vector<DataTypeT> ExpectedValues;
+
+  for (size_t i = 0; i < InputVector1.size(); ++i)
+    ExpectedValues.push_back(
+      Config.computeExpectedValue(InputVector1[i], ScalarInput));
+
+  return ExpectedValues;
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT> LongVector::computeExpectedValues(
+    const std::vector<DataTypeT> &InputVector1,
+    const LongVector::TestConfig<DataTypeT, LongVectorOpTypeT> &Config) {
+
+  VERIFY_IS_TRUE(Config.isUnaryOp(),
+                 L"computeExpectedValues() called with a non-unary op config.");
+
+  std::vector<DataTypeT> ExpectedValues;
+
+  for (size_t i = 0; i < InputVector1.size(); ++i)
+    ExpectedValues.push_back(Config.computeExpectedValue(InputVector1[i]));
+
+  return ExpectedValues;
+}
+
+template <typename DataTypeT>
+void LongVector::logLongVector(const std::vector<DataTypeT> &Values,
+                   const std::wstring &Name) {
+  WEX::Logging::Log::Comment(
+      WEX::Common::String().Format(L"LongVector Name: %s", Name.c_str()));
+
+  const size_t LoggingWidth = 40;
+
+  std::wstringstream Wss(L"");
+  Wss << L"LongVector Values: ";
+  Wss << L"[";
+  const size_t NumElements = Values.size();
+  for (size_t i = 0; i < NumElements; i++) {
+    if (i % LoggingWidth == 0 && i != 0)
+      Wss << L"\n ";
+    Wss << Values[i];
+    if (i != NumElements - 1)
+      Wss << L", ";
+  }
+  Wss << L" ]";
+
+  WEX::Logging::Log::Comment(Wss.str().c_str());
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::TestConfig(LongVector::UnaryOpType OpType)
+    : OpTypeTraits(OpType) {
+  BasicOpType = LongVector::BasicOpType_Unary;
+  // 1 ULP for float types, as for binary ops (the default mode is Epsilon).
+  if (isFloatingPointType<DataTypeT>())
+    Tolerance = 1;
+  ValidationType = LongVector::ValidationType_Ulp;
+
+  switch (OpType) {
+  case LongVector::UnaryOpType_Initialize:
+    IntrinsicString = "TestInitialize";
+    break;
+  default:
+    VERIFY_FAIL("Invalid UnaryOpType");
+  }
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::TestConfig(LongVector::BinaryOpType OpType)
+   : OpTypeTraits(OpType) {
+  IntrinsicString = "";
+  BasicOpType = LongVector::BasicOpType_Binary;
+
+  if (isFloatingPointType<DataTypeT>())
+    Tolerance = 1;
+  ValidationType = LongVector::ValidationType_Ulp;
+
+  switch (OpType) {
+  case LongVector::BinaryOpType_ScalarAdd:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = "+";
+    break;
+  case LongVector::BinaryOpType_ScalarMultiply:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = "*";
+    break;
+  case LongVector::BinaryOpType_ScalarSubtract:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = "-";
+    break;
+  case LongVector::BinaryOpType_ScalarDivide:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = "/";
+    break;
+  case LongVector::BinaryOpType_ScalarModulus:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = "%";
+    break;
+  case LongVector::BinaryOpType_Multiply:
+    OperatorString = "*";
+    break;
+  case LongVector::BinaryOpType_Add:
+    OperatorString = "+";
+    break;
+  case LongVector::BinaryOpType_Subtract:
+    OperatorString = "-";
+    break;
+  case LongVector::BinaryOpType_Divide:
+    OperatorString = "/";
+    break;
+  case LongVector::BinaryOpType_Modulus:
+    OperatorString = "%";
+    break;
+  case LongVector::BinaryOpType_Min:
+    OperatorString = ",";
+    IntrinsicString = "min";
+    break;
+  case LongVector::BinaryOpType_Max:
+    OperatorString = ",";
+    IntrinsicString = "max";
+    break;
+  case LongVector::BinaryOpType_ScalarMin:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = ",";
+    IntrinsicString = "min";
+    break;
+  case LongVector::BinaryOpType_ScalarMax:
+    BasicOpType = LongVector::BasicOpType_ScalarBinary;
+    OperatorString = ",";
+    IntrinsicString = "max";
+    break;
+  default:
+    VERIFY_FAIL("Invalid BinaryOpType");
+  }
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+bool LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::hasFunctionDefinition() const {
+  if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::UnaryOpType>) {
+    if (OpTypeTraits.OpType == LongVector::UnaryOpType_Initialize)
+      return true;
+    else
+      return false;
+  }
+
+  return false;
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getOPERAND2String() const {
+  if (hasFunctionDefinition()) {
+    switch (static_cast<LongVector::UnaryOpType>(OpTypeTraits.OpType)) {
+    case LongVector::UnaryOpType_Initialize:
+      return std::string(" -DFUNC_INITIALIZE=1");
+    default:
+      VERIFY_FAIL("Invalid UnaryOpType");
+    }
+  }
+  return std::string("");
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getHLSLTypeString() const {
+  if (std::is_same_v<DataTypeT, float>)
+    return "float";
+  if (std::is_same_v<DataTypeT, double>)
+    return "double";
+  if (std::is_same_v<DataTypeT, int16_t>)
+    return "int16_t";
+  if (std::is_same_v<DataTypeT, int32_t>)
+    return "int";
+  if (std::is_same_v<DataTypeT, int64_t>)
+    return "int64_t";
+  if (std::is_same_v<DataTypeT, uint16_t>)
+    return "uint16_t";
+  if (std::is_same_v<DataTypeT, uint32_t>)
+    return "uint32_t";
+  if (std::is_same_v<DataTypeT, uint64_t>)
+    return "uint64_t";
+
+  std::string ErrStr("getHLSLTypeString() Unsupported type: ");
+  ErrStr.append(typeid(DataTypeT).name());
+  VERIFY_IS_TRUE(false, ErrStr.c_str());
+  return "UnknownType";
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A, const DataTypeT &B,
+                              LongVector::BinaryOpType OpType) const {
+  switch (OpType) {
+  case LongVector::BinaryOpType_ScalarAdd:
+    return A + B;
+  case LongVector::BinaryOpType_ScalarMultiply:
+    return A * B;
+  case LongVector::BinaryOpType_ScalarSubtract:
+    return A - B;
+  case LongVector::BinaryOpType_ScalarDivide:
+    return A / B;
+  case LongVector::BinaryOpType_ScalarModulus:
+    return mod(A, B);
+  case LongVector::BinaryOpType_Multiply:
+    return A * B;
+  case LongVector::BinaryOpType_Add:
+    return A + B;
+  case LongVector::BinaryOpType_Subtract:
+    return A - B;
+  case LongVector::BinaryOpType_Divide:
+    return A / B;
+  case LongVector::BinaryOpType_Modulus:
+    return mod(A, B);
+  case LongVector::BinaryOpType_Min:
+    // std::max and std::min are wrapped in () to avoid collisions with the
+    // macro definitions for min and max in windows.h
+    return (std::min)(A, B);
+  case LongVector::BinaryOpType_Max:
+    return (std::max)(A, B);
+  case LongVector::BinaryOpType_ScalarMin:
+    return (std::min)(A, B);
+  case LongVector::BinaryOpType_ScalarMax:
+    return (std::max)(A, B);
+  default:
+    LOG_ERROR_FMT_THROW(L"Unknown BinaryOpType: %d", OpTypeTraits.OpType);
+    return DataTypeT();
+  }
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A, const DataTypeT &B) const {
+  if(!isBinaryOp())
+    LOG_ERROR_FMT_THROW(
+      L"computeExpectedValue(const DataTypeT &A, const DataTypeT &B) called "
+      L"on a unary op: %d",
+      OpTypeTraits.OpType);
+
+  return computeExpectedValue(A, B, static_cast<LongVector::BinaryOpType>(OpTypeTraits.OpType));
+}
+
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A,
+                              LongVector::UnaryOpType OpType) const {
+  switch (OpType) {
+  case LongVector::UnaryOpType_Initialize:
+    return A;
+  default:
+    LOG_ERROR_FMT_THROW(L"Unknown UnaryOpType :%d", OpTypeTraits.OpType);
+    return DataTypeT();
+  }
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A) const {
+
+  if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::UnaryOpType>) {
+    const auto OpType = static_cast<LongVector::UnaryOpType>(OpTypeTraits.OpType);
+    return computeExpectedValue(A, OpType);
+  }
+
+  LOG_ERROR_FMT_THROW(
+    L"computeExpectedValue(const DataType&A) called on an unrecognized binary op: %d",
+    OpTypeTraits.OpType);
+
+  return DataTypeT();
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getCompilerOptionsString(size_t VectorSize) const {
+  std::stringstream CompilerOptions("");
+  std::string HLSLType = getHLSLTypeString();
+  CompilerOptions << "-DTYPE=";
+  CompilerOptions << HLSLType;
+  CompilerOptions << " -DNUM=";
+  CompilerOptions << VectorSize;
+  const bool Is16BitType =
+    (HLSLType == "int16_t" || HLSLType == "uint16_t" || HLSLType == "half");
+  CompilerOptions << (Is16BitType ? " -enable-16bit-types" : "");
+  CompilerOptions << " -DOPERATOR=";
+  CompilerOptions << OperatorString;
+
+  if (isBinaryOp()) {
+    CompilerOptions << " -DOPERAND2=";
+    CompilerOptions << (isScalarOp() ? "InputScalar" : "InputVector2");
+
+    if (isScalarOp())
+      CompilerOptions << " -DIS_SCALAR_OP=1";
+    else
+      CompilerOptions << " -DIS_BINARY_VECTOR_OP=1";
+
+    CompilerOptions << " -DFUNC=";
+    CompilerOptions << IntrinsicString;
+  } else { // Unary Op
+    CompilerOptions << " -DFUNC=";
+    CompilerOptions << IntrinsicString;
+    CompilerOptions << " -DOPERAND2=";
+    CompilerOptions << getOPERAND2String();
+  }
+
+  return CompilerOptions.str();
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT> LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getInputValueSet(size_t ValueSetIndex) const {
+  if (ValueSetIndex == 2 && !isBinaryOp())
+    VERIFY_FAIL("ValueSetindex==2 is only valid for binary ops.");
+
+  std::wstring InputValueSetName = L"";
+  if (ValueSetIndex == 1)
+    InputValueSetName = InputValueSetName1;
+  else if (ValueSetIndex == 2)
+    InputValueSetName = InputValueSetName2;
+  else
+    VERIFY_FAIL("Invalid ValueSetIndex");
+
+  return getInputValueSetByKey<DataTypeT>(InputValueSetName);
+}
diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
index e768f205f1..a782bd97ae 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
+++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
@@ -3750,4 +3750,71 @@ void MSMain(uint GID : SV_GroupIndex,
     </Shader>
   </ShaderOp>
 
+  <ShaderOp Name="LongVectorOp" CS="CS">
+    <RootSignature>RootFlags(0), UAV(u0), UAV(u1), UAV(u2),
+    UAV(u3)</RootSignature>
+    <!-- Width="16" BYTES to account for two largest scalar types (64 bits)-->
+    <Resource Name="InputFuncArgs" Dimension="BUFFER" Width="16"
+    Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+    TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <!-- Width="8192" BYTES to account for largest type (64 bits) and vector
+    size of 1024 elements (the max long vector size)-->
+    <Resource Name="InputVector1" Dimension="BUFFER" Width="8192" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="InputVector2" Dimension="BUFFER" Width="8192" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="OutputVector" Dimension="BUFFER" Width="8192" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="InputFuncArgs" />
+      <RootValue Index="1" ResName="InputVector1" />
+      <RootValue Index="2" ResName="InputVector2" />
+      <RootValue Index="3" ResName="OutputVector" />
+    </RootValues>
+    <!-- This shader requires the following defines to be passed in as arguments:
+     // 1 and 2 are required to compile the shader.
+     // 1. -DOPERATOR : "*" "+" "," "-" "/" etc.
+     // 2. -DOPERAND2 : InputVector2, InputScalar, or "" depending on the test.
+     // Other defines are optional and are used to test different functions.
+     -->
+    <Shader Name="CS" Target="cs_6_9" EntryPoint="main">
+      <![CDATA[
+
+        #ifdef FUNC_INITIALIZE
+        vector<TYPE, NUM> TestInitialize(vector<TYPE, NUM> Vector)
+        {
+          vector<TYPE, NUM> VectorCopy = Vector;
+          return VectorCopy;
+        }
+        #endif
+
+        RWByteAddressBuffer g_InputFuncArgs : register(u0);
+        RWByteAddressBuffer g_InputVector1 : register(u1);
+        RWByteAddressBuffer g_InputVector2 : register(u2);
+        RWByteAddressBuffer g_OutputVector : register(u3);
+        [numthreads(1,1,1)]
+        void main(uint GI : SV_GroupIndex) {
+
+          vector<TYPE, NUM> InputVector1 = g_InputVector1.Load< vector<TYPE,
+          NUM> >(0);
+
+          #ifdef IS_BINARY_VECTOR_OP
+          vector<TYPE, NUM> InputVector2 = g_InputVector2.Load< vector<TYPE,
+          NUM> >(0);
+          #endif
+
+          #ifdef IS_SCALAR_OP
+          TYPE InputScalar = g_InputFuncArgs.Load<TYPE>(0);
+          #endif
+
+          #ifdef FUNC_CLAMP
+          TYPE Clamp_ArgMin = g_InputFuncArgs.Load<TYPE>(0);
+          TYPE Clamp_ArgMax = g_InputFuncArgs.Load<TYPE>(sizeof(TYPE));
+          vector<TYPE, 2> ClampArgMinMax = {Clamp_ArgMin, Clamp_ArgMax};
+          #endif
+
+          vector<TYPE, NUM> OutputVector = FUNC(InputVector1 OPERATOR OPERAND2);
+
+          g_OutputVector.Store< vector<TYPE, NUM> >(0, OutputVector);
+        };
+      ]]>
+    </Shader>
+  </ShaderOp>
 </ShaderOpSet>

From b331216bace89303857ef66617b94f03a6c715fd Mon Sep 17 00:00:00 2001
From: Shawn Hatori <5499686+shawnhatori@users.noreply.github.com>
Date: Thu, 3 Jul 2025 13:45:28 -0400
Subject: [PATCH 83/93] [SPIR-V] Explicitly state which layout rules require
 scalar block layout (#7539)

I was trying to debug a Vulkan Storage Buffer-related memory alignment
issue in my application where I was using SPIR-V generated via `dxc`
with `-fvk-use-dx-layout`. In `SPIR-V.rst`, I happened to miss the
paragraph that follows the list of layout rules (removed in this
proposal). That paragraph starts with "To use scalar layout", which
given my use of DirectX layout, I did not think was relevant to me.
However, the next sentence of that paragraph sneakily and indirectly
mentions that `VK_EXT_scalar_block_layout` is required for the DirectX
memory layout as well.

I have proposed explicitly stating the extension requirement when the
relevant layout rules are listed.
---
 docs/SPIR-V.rst | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
index 771cf0e5a2..2bcdb99bfe 100644
--- a/docs/SPIR-V.rst
+++ b/docs/SPIR-V.rst
@@ -1012,17 +1012,18 @@ right now:
 2. DirectX memory layout rules for uniform buffers and storage buffers:
    they allow packing data on the application side that can be shared with
    DirectX. They can be enabled by ``-fvk-use-dx-layout``.
+   
+   NOTE: This requires ``VK_EXT_scalar_block_layout`` to be enabled on the
+   application side.
 3. Strict OpenGL ``std140`` for uniform buffers and strict OpenGL ``std430``
    for storage buffers: they allow packing data on the application side that
    can be shared with OpenGL. They can be enabled by ``-fvk-use-gl-layout``.
 4. Scalar layout rules introduced via `VK_EXT_scalar_block_layout`, which
    basically aligns all aggregrate types according to their elements'
    natural alignment. They can be enabled by ``-fvk-use-scalar-layout``.
-
-To use scalar layout, the application side need to request
-``VK_EXT_scalar_block_layout``. This is also true for using DirectX memory
-layout since there is no dedicated DirectX layout extension for Vulkan
-(at least for now). So we must request something more permissive.
+   
+   NOTE: This requires ``VK_EXT_scalar_block_layout`` to be enabled on the
+   application side.
 
 In the above, "vector-relaxed OpenGL ``std140``/``std430``" rules mean OpenGL
 ``std140``/``std430`` rules with the following modification for vector type
@@ -1032,7 +1033,7 @@ alignment:
 2. If the above causes an `improper straddle <https://registry.khronos.org/vulkan/specs/latest/html/vkspec.html#interfaces-resources-layout>`_,
    the alignment will be set to 16 bytes.
 
-As an exmaple, for the following HLSL definition:
+As an example, for the following HLSL definition:
 
 .. code:: hlsl
 

From 4fcf67f78f7d6ffd286316112694a3ae000860e2 Mon Sep 17 00:00:00 2001
From: Dan Brown <61992655+danbrown-amd@users.noreply.github.com>
Date: Thu, 3 Jul 2025 12:34:35 -0600
Subject: [PATCH 84/93] [spirv] Fixes #7535 (vk::BufferPointer alignment
 issue). (#7571)

---
 tools/clang/lib/SPIRV/SpirvBuilder.cpp        | 15 ++++++--
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 37 +++++--------------
 .../vk.buffer-pointer.alias.cs.hlsl           |  2 +-
 .../CodeGenSPIRV/vk.buffer-pointer.alias.hlsl |  4 +-
 .../vk.buffer-pointer.atomic.hlsl             |  2 +-
 .../vk.buffer-pointer.from-uint.hlsl          |  4 +-
 .../vk.buffer-pointer.linked-list.hlsl        |  6 +--
 7 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/tools/clang/lib/SPIRV/SpirvBuilder.cpp b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
index e085603b21..22523eed0e 100644
--- a/tools/clang/lib/SPIRV/SpirvBuilder.cpp
+++ b/tools/clang/lib/SPIRV/SpirvBuilder.cpp
@@ -205,10 +205,17 @@ SpirvInstruction *SpirvBuilder::createLoad(QualType resultType,
   instruction->setRValue(true);
 
   if (pointer->getStorageClass() == spv::StorageClass::PhysicalStorageBuffer) {
-    AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions);
-    uint32_t align, size, stride;
-    std::tie(align, size) = alignmentCalc.getAlignmentAndSize(
-        resultType, pointer->getLayoutRule(), llvm::None, &stride);
+    QualType pointerType = pointer->getAstResultType();
+    uint32_t align = 0;
+    if (!pointerType.isNull() && hlsl::IsVKBufferPointerType(pointerType)) {
+      align = hlsl::GetVKBufferPointerAlignment(pointerType);
+    }
+    if (!align) {
+      AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions);
+      uint32_t stride;
+      std::tie(align, std::ignore) = alignmentCalc.getAlignmentAndSize(
+          resultType, pointer->getLayoutRule(), llvm::None, &stride);
+    }
     instruction->setAlignment(align);
   }
 
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 850a8dd736..c2ee495d28 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -11179,36 +11179,19 @@ SpirvEmitter::processIntrinsicPointerCast(const CallExpr *callExpr,
 
 SpirvInstruction *SpirvEmitter::processIntrinsicGetBufferContents(
     const CXXMemberCallExpr *callExpr) {
-  LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
-                                    spvBuilder);
-  Expr *obj = callExpr->getImplicitObjectArgument();
-  SpirvInstruction *bufferPointer = doExpr(obj);
+  SpirvInstruction *bufferPointer =
+      doExpr(callExpr->getImplicitObjectArgument());
   if (!bufferPointer)
     return nullptr;
-  if (bufferPointer->isRValue()) {
-    bufferPointer->setRValue(false);
-    bufferPointer->setStorageClass(spv::StorageClass::PhysicalStorageBuffer);
-    bufferPointer->setLayoutRule(spirvOptions.sBufferLayoutRule);
-    return bufferPointer;
-  }
-
-  unsigned align = hlsl::GetVKBufferPointerAlignment(obj->getType());
-  lowerTypeVisitor.visitInstruction(bufferPointer);
-
-  const SpirvPointerType *bufferPointerType =
-      dyn_cast<SpirvPointerType>(bufferPointer->getResultType());
-  SpirvLoad *retVal =
-      spvBuilder.createLoad(bufferPointerType->getPointeeType(), bufferPointer,
-                            callExpr->getLocStart());
-  if (!align) {
-    QualType bufferType = hlsl::GetVKBufferPointerBufferType(obj->getType());
-    AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions);
-    uint32_t stride;
-    std::tie(align, std::ignore) = alignmentCalc.getAlignmentAndSize(
-        bufferType, retVal->getLayoutRule(), llvm::None, &stride);
-  }
-  retVal->setAlignment(align);
+
+  SpirvInstruction *retVal =
+      bufferPointer->isRValue()
+          ? bufferPointer
+          : spvBuilder.createLoad(bufferPointer->getAstResultType(),
+                                  bufferPointer, callExpr->getLocStart());
   retVal->setRValue(false);
+  retVal->setStorageClass(spv::StorageClass::PhysicalStorageBuffer);
+  retVal->setLayoutRule(spirvOptions.sBufferLayoutRule);
   return retVal;
 }
 
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl
index f0f5c54a16..e063a4bc23 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl
@@ -20,7 +20,7 @@ void main() {
   foo(rwbuf[0].Get());
 }
 
-// CHECK: [[L0:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} %{{[_0-9A-Za-z]*}} Aligned 8
+// CHECK: [[L0:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} %{{[_0-9A-Za-z]*}}
 // CHECK: [[L1:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} [[L0]] Aligned 8
 // CHECK: [[L2:%[_0-9A-Za-z]*]] = OpAccessChain %{{[_0-9A-Za-z]*}} [[L1]] %int_0
 // CHECK: OpStore [[L2]] %int_1 Aligned 4
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl
index fc5b9edad0..e159f6997c 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl
@@ -62,10 +62,10 @@ float4 MainPs(void) : SV_Target0
 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[X3]]
 // CHECK: OpStore [[BP1]] [[X4]]
 // CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[VTEST]]
-// CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP0]] Aligned 16
+// CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP0]]
 // CHECK: [[X7:%[_0-9A-Za-z]*]] = OpAccessChain [[PBV4FLOAT]] [[X6]] [[I1]]
 // CHECK: OpStore [[X7]] [[X5]] Aligned 16
-// CHECK: [[X8:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP1]] Aligned 16
+// CHECK: [[X8:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP1]]
 // CHECK: [[X9:%[_0-9A-Za-z]*]] = OpAccessChain [[PBV4FLOAT]] [[X8]] [[I1]]
 // CHECK: [[X10:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X9]] Aligned 16
 // CHECK: OpReturnValue [[X10]]
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl
index 992d8b39fd..485da6fd93 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl
@@ -29,7 +29,7 @@ void main()
   uint u0, u1;
 
 // CHECK: [[X1:%[_0-9]+]] = OpAccessChain %{{[_0-9A-Za-z]*}} [[PC]] [[I0]]
-// CHECK: [[X2:%[_0-9]+]] = OpLoad [[PS]] [[X1]] Aligned 4
+// CHECK: [[X2:%[_0-9]+]] = OpLoad [[PS]] [[X1]]
 // CHECK: [[X3:%[_0-9]+]] = OpAccessChain [[PU]] [[X2]] [[I0]]
 // CHECK: [[X4:%[_0-9]+]] = OpLoad [[UINT]] [[IN]]
 // CHECK: [[X5:%[_0-9]+]] = OpAtomicExchange [[UINT]] [[X3]] [[U1]] [[U0]] [[X4]]
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl
index b44e1eca09..e7908e0ce7 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl
@@ -37,8 +37,8 @@ void main() {
 // CHECK: [[TEST:%[_0-9A-Za-z]*]] = OpVariable [[PFPPUINT]] Function
 // CHECK: [[X1:%[_0-9A-Za-z]*]] = OpConvertUToPtr [[PPUINT]]
 // CHECK: OpStore [[TEST]] [[X1]]
-// CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PPUINT]] [[TEST]] Aligned 32
-// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[UINT]] [[X2]] Aligned 4
+// CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PPUINT]] [[TEST]]
+// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[UINT]] [[X2]] Aligned 32
 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpAccessChain [[PUUINT]] [[OUTPUT]] [[I0]] [[U0]]
 // CHECK: OpStore [[X4]] [[X3]]
 // CHECK: OpReturn
diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl
index 71fee1a795..75380d3f4e 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl
@@ -76,9 +76,9 @@ float4 MainPs(void) : SV_Target0
 // CHECK: [[X1:%[_0-9A-Za-z]*]] = OpAccessChain [[PPBLOCK1]] [[GPC]] [[S0]]
 // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X1]]
 // CHECK: OpStore [[GP]] [[X2]]
-// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] Aligned 32
+// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]]
 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpAccessChain [[PPBLOCK2]] [[X3]] [[S1]]
-// CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X4]] Aligned 8
+// CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X4]] Aligned 32
 // CHECK: OpStore [[GP]] [[X5]]
 // CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]]
 // CHECK: [[X7:%[_0-9A-Za-z]*]] = OpConvertPtrToU [[ULONG]] [[X6]]
@@ -94,7 +94,7 @@ float4 MainPs(void) : SV_Target0
 // CHECK: [[IF_TRUE]] = OpLabel
 // CHECK: OpReturnValue [[CV4FLOAT]]
 // CHECK: [[IF_MERGE]] = OpLabel
-// CHECK: [[X13:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] Aligned 32
+// CHECK: [[X13:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]]
 // CHECK: [[X14:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X13]] [[S0]]
 // CHECK: [[X15:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X14]] Aligned 16
 // CHECK: OpReturnValue [[X15]]

From d751c827ed3b61e87fdf57d0f424cb2d7af30cd0 Mon Sep 17 00:00:00 2001
From: Russell Liu <ginshio78@gmail.com>
Date: Mon, 7 Jul 2025 21:34:41 +0800
Subject: [PATCH 85/93] [SPIRV] Allow spirv type as template parameter (#7626)

SPIR-V intrinsics allow us to create SPIR-V basic types and opaque types in
HLSL, but these types are objects and are not allowed as template parameters.

```fundamental
error: object 'Int8Type' is not allowed in builtin template parameters
    /* OpTypeCooperativeMatrixKHR */ 4456, Int8Type,
                                           ^
```

This restriction doesn't make sense to me, and it is inconvenient. This
change allows those types to be used as template parameters.
---
 tools/clang/lib/Sema/SemaHLSL.cpp             |  9 ++++++
 .../CodeGenSPIRV/spv.intrinsicInTemplate.hlsl | 29 +++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl

diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 3d9de1804d..8e800e8f68 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -5402,6 +5402,15 @@ class HLSLExternalSource : public ExternalSemaSource {
         objectKind = ClassifyRecordType(recordType);
         switch (objectKind) {
         case AR_TOBJ_OBJECT:
+#ifdef ENABLE_SPIRV_CODEGEN
+          if (const auto *namespaceDecl = dyn_cast<NamespaceDecl>(
+                  recordType->getDecl()->getDeclContext());
+              namespaceDecl && namespaceDecl->getName().equals("vk") &&
+              (recordType->getDecl()->getName().equals("SpirvType") ||
+               recordType->getDecl()->getName().equals("SpirvOpaqueType"))) {
+            return true;
+          }
+#endif
           m_sema->Diag(argLoc, diag::err_hlsl_unsupported_object_context)
               << type << static_cast<unsigned>(TypeDiagContext::TypeParameter);
           return false;
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl b/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl
new file mode 100644
index 0000000000..0ecda64dbb
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl
@@ -0,0 +1,29 @@
+// RUN: %dxc -T cs_6_8 -HV 2021 -O0 -spirv -fspv-target-env=universal1.5 %s | FileCheck %s
+
+// CHECK: [[Int8Type:%.*]] = OpTypeInt 8 0
+using Int8Type = vk::SpirvType</* OpTypeInt */ 21, 8, 8,
+                               vk::Literal<vk::integral_constant<uint32_t, 8> >,
+                               vk::Literal<vk::integral_constant<bool, 0> > >;
+
+// CHECK: [[MatrixType:%.*]] = OpTypeCooperativeMatrixKHR [[Int8Type]] %uint_3 %uint_16 %uint_16 %uint_0
+using I8MatA = vk::SpirvOpaqueType<
+    /* OpTypeCooperativeMatrixKHR */ 4456, Int8Type,
+    vk::integral_constant<uint, /* ScopeSubgroup */ 3>,
+    vk::integral_constant<uint, 16>, vk::integral_constant<uint, 16>,
+    vk::integral_constant<uint, /* Use */ 0> >;
+
+template <typename ResultType, typename PointerType>
+[[vk::ext_instruction(/* OpCooperativeMatrixLoadKHR */ 4457)]] ResultType
+__builtin_spv_CooperativeMatrixLoadKHR([[vk::ext_reference]] PointerType pointer,
+    uint32_t memory_layout, uint32_t stride, [[vk::ext_literal]] uint32_t memory_operand);
+
+StructuredBuffer<uint32_t> buffer : register(t0, space0);
+
+[numthreads(32, 1, 1)] void main() {
+  [[vk::ext_extension("SPV_KHR_cooperative_matrix")]]
+  [[vk::ext_capability(/* CooperativeMatrixKHRCapability */ 6022)]]
+  [[vk::ext_capability(/* VulkanMemoryModel */ 5345)]]
+  [[vk::ext_capability(/* Int8 */ 39)]]
+  // CHECK: OpCooperativeMatrixLoadKHR [[MatrixType]] %{{.*}} %uint_0 %uint_32 None
+  I8MatA matA = __builtin_spv_CooperativeMatrixLoadKHR<I8MatA>(buffer[0], /* rowMajor */ 0, 32, 0);
+}

From a11702ef0a393a9e0b78f982f9f0fa66d919c867 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Tue, 8 Jul 2025 10:46:31 -0400
Subject: [PATCH 86/93] [SPIRV] Add the derivative group execution mode only on
 shader types that allow it. (#7628)

DXC allows users to use derivative instructions in shader models that do
not allow them, but they must be dead code that will be removed. However,
when we see a derivative instruction in the SPIR-V backend that is not
in a pixel shader we assume it needs the DerivativeGroup execution mode,
and we fail when we try to add it to a vertex shader.

To allow our implementation to match DXIL, we will not assume we can add
the execution mode. We will only add it for shaders that we know can use
it, and skip the others.

If the derivative instruction is not removed during optimizations, there
will be a validation error.

While fixing this, we observed another bug that is fixed at the same
time since they are closely related. The TaskNV and TaskEXT shader types
do not have the same id, and the SPV_KHR_compute_shader_derivatives does
not work with the NV mesh shader extension. That was fixed up.

Fixes #7478
---
 tools/clang/lib/SPIRV/SpirvEmitter.cpp        | 49 ++++++++++++-------
 .../amplification_shader_derivative.hlsl      | 28 +++++++++++
 .../CodeGenSPIRV/mesh_shader_derivative.hlsl  | 34 +++++++++++++
 .../vertex_shader_derivative_in_branch.hlsl   | 23 +++++++++
 4 files changed, 115 insertions(+), 19 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl
 create mode 100644 tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl

diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index c2ee495d28..734340e9ae 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -4399,9 +4399,7 @@ SpirvEmitter::processTextureLevelOfDetail(const CXXMemberCallExpr *expr,
       spvBuilder.createImageQuery(spv::Op::OpImageQueryLod, queryResultType,
                                   expr->getExprLoc(), sampledImage, coordinate);
 
-  if (spvContext.isCS() || spvContext.isNode()) {
-    addDerivativeGroupExecutionMode();
-  }
+  addDerivativeGroupExecutionMode();
   // The first component of the float2 contains the mipmap array layer.
   // The second component of the float2 represents the unclamped lod.
   return spvBuilder.createCompositeExtract(astContext.FloatTy, query,
@@ -5780,9 +5778,7 @@ SpirvEmitter::processTextureSampleGather(const CXXMemberCallExpr *expr,
 
   const auto retType = expr->getDirectCallee()->getReturnType();
   if (isSample) {
-    if (spvContext.isCS() || spvContext.isNode()) {
-      addDerivativeGroupExecutionMode();
-    }
+    addDerivativeGroupExecutionMode();
     return createImageSample(retType, imageType, image, sampler, coordinate,
                              /*compareVal*/ nullptr, /*bias*/ nullptr,
                              /*lod*/ nullptr, std::make_pair(nullptr, nullptr),
@@ -5870,9 +5866,9 @@ SpirvEmitter::processTextureSampleBiasLevel(const CXXMemberCallExpr *expr,
 
   const auto retType = expr->getDirectCallee()->getReturnType();
 
-  if (!lod && (spvContext.isCS() || spvContext.isNode())) {
+  if (!lod)
     addDerivativeGroupExecutionMode();
-  }
+
   return createImageSample(
       retType, imageType, image, sampler, coordinate,
       /*compareVal*/ nullptr, bias, lod, std::make_pair(nullptr, nullptr),
@@ -5992,9 +5988,7 @@ SpirvEmitter::processTextureSampleCmp(const CXXMemberCallExpr *expr) {
   const auto retType = expr->getDirectCallee()->getReturnType();
   const auto imageType = imageExpr->getType();
 
-  if (spvContext.isCS()) {
-    addDerivativeGroupExecutionMode();
-  }
+  addDerivativeGroupExecutionMode();
 
   return createImageSample(
       retType, imageType, image, sampler, coordinate, compareVal,
@@ -6047,9 +6041,7 @@ SpirvEmitter::processTextureSampleCmpBias(const CXXMemberCallExpr *expr) {
   const auto retType = expr->getDirectCallee()->getReturnType();
   const auto imageType = imageExpr->getType();
 
-  if (spvContext.isCS()) {
-    addDerivativeGroupExecutionMode();
-  }
+  addDerivativeGroupExecutionMode();
 
   return createImageSample(
       retType, imageType, image, sampler, coordinate, compareVal, bias,
@@ -9782,8 +9774,7 @@ SpirvInstruction *SpirvEmitter::processDerivativeIntrinsic(
   QualType returnType = arg->getAstResultType();
   assert(isFloatOrVecOfFloatType(returnType));
 
-  if (!spvContext.isPS())
-    addDerivativeGroupExecutionMode();
+  addDerivativeGroupExecutionMode();
   needsLegalization = true;
 
   QualType B32Type = astContext.FloatTy;
@@ -12512,8 +12503,7 @@ SpirvInstruction *SpirvEmitter::processIntrinsicUsingSpirvInst(
     case spv::Op::OpFwidth:
     case spv::Op::OpFwidthFine:
     case spv::Op::OpFwidthCoarse:
-      if (spvContext.isCS() || spvContext.isNode())
-        addDerivativeGroupExecutionMode();
+      addDerivativeGroupExecutionMode();
       needsLegalization = true;
       break;
     default:
@@ -15771,8 +15761,29 @@ bool SpirvEmitter::spirvToolsValidate(std::vector<uint32_t> *mod,
   return tools.Validate(mod->data(), mod->size(), options);
 }
 
+static bool canUseDerivativeGroupExecutionMode(SpirvContext::ShaderModelKind sm,
+                                               bool usingEXTMeshShader) {
+  switch (sm) {
+  case SpirvContext::ShaderModelKind::Compute:
+  case SpirvContext::ShaderModelKind::Node:
+    return true;
+
+  // The KHR extension that allows derivative instruction in mesh and task
+  // (amplification) shader does not work with SPV_NV_mesh_shader extension.
+  case SpirvContext::ShaderModelKind::Mesh:
+  case SpirvContext::ShaderModelKind::Amplification:
+    return usingEXTMeshShader;
+  default:
+    return false;
+  }
+}
+
 void SpirvEmitter::addDerivativeGroupExecutionMode() {
-  assert(spvContext.isCS());
+  bool usingEXTMeshShader =
+      featureManager.isExtensionEnabled(Extension::EXT_mesh_shader);
+  SpirvContext::ShaderModelKind sm = spvContext.getCurrentShaderModelKind();
+  if (!canUseDerivativeGroupExecutionMode(sm, usingEXTMeshShader))
+    return;
 
   SpirvExecutionMode *numThreadsEm =
       cast<SpirvExecutionMode>(spvBuilder.getModule()->findExecutionMode(
diff --git a/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl b/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl
new file mode 100644
index 0000000000..9982cf1cda
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -T as_6_5 -E main -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s --check-prefix=VK13
+// RUN: %dxc -T as_6_5 -E main -fspv-target-env=vulkan1.1 -Vd %s -spirv | FileCheck %s --check-prefix=VK11
+
+// VK13-DAG: OpCapability ComputeDerivativeGroupLinearKHR
+// VK13-DAG: OpCapability DerivativeControl
+// VK13-DAG: OpCapability MeshShadingEXT
+// VK13-DAG: OpExtension "SPV_EXT_mesh_shader"
+// VK13-DAG: OpExtension "SPV_KHR_compute_shader_derivatives"
+// VK13: OpEntryPoint TaskEXT %main "main"
+// VK13: OpExecutionMode %main DerivativeGroupLinearKHR
+
+// VK11-DAG: OpExtension "SPV_NV_mesh_shader"
+// VK11: OpEntryPoint TaskNV %main "main"
+// VK11-NOT: OpExecutionMode %main DerivativeGroup
+
+struct AmplificationPayload
+{
+    float4 value;
+};
+
+groupshared AmplificationPayload payload;
+
+[numthreads(4, 1, 1)]
+void main(in uint tid : SV_GroupThreadID, in uint gtid : SV_GroupID)
+{
+    payload.value = ddx_coarse(float4(tid, 0, 0, 0));
+    DispatchMesh(1,1,1, payload);
+}
diff --git a/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl b/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl
new file mode 100644
index 0000000000..3f26921e28
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl
@@ -0,0 +1,34 @@
+// RUN: %dxc -T ms_6_5 -E main -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s --check-prefix=VK13
+// RUN: %dxc -T ms_6_5 -E main -fspv-target-env=vulkan1.1 -Vd %s -spirv | FileCheck %s --check-prefix=VK11
+
+// VK13-DAG: OpCapability ComputeDerivativeGroupLinearKHR
+// VK13-DAG: OpCapability DerivativeControl
+// VK13-DAG: OpCapability MeshShadingEXT
+// VK13-DAG: OpExtension "SPV_EXT_mesh_shader"
+// VK13-DAG: OpExtension "SPV_KHR_compute_shader_derivatives"
+// VK13: OpEntryPoint MeshEXT %main "main"
+// VK13: OpExecutionMode %main DerivativeGroupLinearKHR
+
+// VK11-DAG: OpExtension "SPV_NV_mesh_shader"
+// VK11: OpEntryPoint MeshNV %main "main"
+// VK11-NOT: OpExecutionMode %main DerivativeGroup
+
+struct VSOut
+{
+    float4 pos : SV_Position;
+};
+
+[numthreads(4, 1, 1)]
+[outputtopology("triangle")]
+void main(in uint tid : SV_GroupThreadID, out vertices VSOut verts[3], out indices uint3 tris[1])
+{
+    SetMeshOutputCounts(3, 1);
+
+    float4 val = ddx_coarse(float4(tid, 0, 0, 0));
+
+    verts[0].pos = val;
+    verts[1].pos = val + float4(0,1,0,0);
+    verts[2].pos = val + float4(1,0,0,0);
+
+    tris[0] = uint3(0,1,2);
+}
diff --git a/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl b/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl
new file mode 100644
index 0000000000..9719dc1dc5
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -T vs_6_0 -E main -DCOND=false -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s
+// CHECK-NOT: OpCapability DerivativeControl
+// CHECK-NOT: OpExtension "SPV_KHR_compute_shader_derivatives"
+
+// RUN: not %dxc -T vs_6_0 -E main -DCOND=true -fspv-target-env=vulkan1.3 %s -spirv 2>&1 | FileCheck %s -check-prefix=ERROR
+// ERROR: generated SPIR-V is invalid:
+// ERROR-NEXT: Derivative instructions require Fragment, GLCompute, MeshEXT or TaskEXT execution model: DPdx
+
+struct VSOut
+{
+    float4 pos : SV_Position;
+};
+
+VSOut main(float4 pos : POSITION)
+{
+    VSOut output;
+    output.pos = pos;
+    if (COND)
+    {
+        output.pos += ddx(pos);
+    }
+    return output;
+}

From 4efa3dc842ac99a38d940aa64cb80819a7ebd49c Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alexsepkowski@gmail.com>
Date: Thu, 10 Jul 2025 15:30:42 -0700
Subject: [PATCH 87/93] Merge HLSLHalf_t and HLSLBool_t (#7630)

This PR merges some more long vector exec test code from staging-sm6.9
into main. Specifically, we bring over the helper classes that define
data types for half and bool. Halfs are only available in newer c++
versions so a simple class was needed to implement the proper logic
using existing DX helpers that were added for this same reason. The bool
class is used as the size of a bool in c++ differs from that in HLSL.

Also brings in some test cases using these data types. Test cases were
verified locally by running against WARP.

Addresses #7546
---
 .../unittests/HLSLExec/LongVectorOpTable.xml  |  84 +++++++
 .../unittests/HLSLExec/LongVectorTestData.h   | 226 +++++++++++++++++-
 .../clang/unittests/HLSLExec/LongVectors.cpp  |   6 +-
 tools/clang/unittests/HLSLExec/LongVectors.h  |   7 +-
 .../clang/unittests/HLSLExec/LongVectors.tpp  |  73 +++++-
 5 files changed, 383 insertions(+), 13 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
index 39a2fa481e..df8fe250c9 100644
--- a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
+++ b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
@@ -12,6 +12,23 @@
           <ParameterType Name="DataType">String</ParameterType>
           <ParameterType Name="OpTypeEnum">String</ParameterType>
         </ParameterTypes>
+        <!-- LongVectorBinaryOpTypeTable DataType: bool -->
+        <Row Name="ScalarAdd_bool">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">bool</Parameter>
+        </Row>
+        <Row Name="Add_bool">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">bool</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_bool">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">bool</Parameter>
+        </Row>
+        <Row Name="Subtract_bool">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">bool</Parameter>
+        </Row>
         <!-- LongVectorBinaryOpTypeTable DataType: int16 -->
         <Row Name="ScalarAdd_int16">
           <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
@@ -354,6 +371,63 @@
           <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
           <Parameter Name="DataType">uint64</Parameter>
         </Row>
+        <!-- LongVectorBinaryOpTypeTable DataType: float16 -->
+        <Row Name="ScalarAdd_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Add_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Add</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarSubtract_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarSubtract</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Subtract_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Subtract</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarMultiply_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMultiply</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Multiply_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Multiply</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarDivide_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarDivide</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Divide_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Divide</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarModulus_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarModulus</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Modulus_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Modulus</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarMin_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMin</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Min_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Min</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="ScalarMax_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_ScalarMax</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
+        <Row Name="Max_float16">
+          <Parameter Name="OpTypeEnum">BinaryOpType_Max</Parameter>
+          <Parameter Name="DataType">float16</Parameter>
+        </Row>
         <!-- LongVectorBinaryOpTypeTable DataType: float32 -->
         <Row Name="ScalarAdd_float32">
           <Parameter Name="OpTypeEnum">BinaryOpType_ScalarAdd</Parameter>
@@ -471,6 +545,11 @@
         <ParameterType Name="DataType">String</ParameterType>
         <ParameterType Name="OpTypeEnum">String</ParameterType>
       </ParameterTypes>
+      <!-- LongVectorUnaryOpTypeTable DataType: bool -->
+      <Row Name="Initialize_bool">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">bool</Parameter>
+      </Row>
       <!-- LongVectorUnaryOpTypeTable DataType: int16 -->
       <Row Name="Initialize_int16">
         <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
@@ -501,6 +580,11 @@
         <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
         <Parameter Name="DataType">uint64</Parameter>
       </Row>
+      <!-- LongVectorUnaryOpTypeTable DataType: float16 -->
+      <Row Name="Initialize_float16">
+        <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
       <!-- LongVectorUnaryOpTypeTable DataType: float32 -->
       <Row Name="Initialize_float32">
         <Parameter Name="OpTypeEnum">UnaryOpType_Initialize</Parameter>
diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h
index 002c765609..bc6ea8c7c2 100644
--- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h
+++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h
@@ -7,10 +7,204 @@
 #include <string>
 #include <vector>
 
+// A helper struct because C++ bools are 1 byte and HLSL bools are 4 bytes.
+// Take int32_t as a constructor argument and convert it to bool when needed.
+// Comparisons cast to a bool because we only care if the bool representation is
+// true or false.
+struct HLSLBool_t {
+  HLSLBool_t() : Val(0) {}
+  HLSLBool_t(int32_t Val) : Val(Val) {}
+  HLSLBool_t(bool Val) : Val(Val) {}
+  HLSLBool_t(const HLSLBool_t &Other) : Val(Other.Val) {}
+
+  bool operator==(const HLSLBool_t &Other) const {
+    return static_cast<bool>(Val) == static_cast<bool>(Other.Val);
+  }
+
+  bool operator!=(const HLSLBool_t &Other) const {
+    return static_cast<bool>(Val) != static_cast<bool>(Other.Val);
+  }
+
+  bool operator<(const HLSLBool_t &Other) const { return Val < Other.Val; }
+
+  bool operator>(const HLSLBool_t &Other) const { return Val > Other.Val; }
+
+  bool operator<=(const HLSLBool_t &Other) const { return Val <= Other.Val; }
+
+  bool operator>=(const HLSLBool_t &Other) const { return Val >= Other.Val; }
+
+  HLSLBool_t operator*(const HLSLBool_t &Other) const {
+    return HLSLBool_t(Val * Other.Val);
+  }
+
+  HLSLBool_t operator+(const HLSLBool_t &Other) const {
+    return HLSLBool_t(Val + Other.Val);
+  }
+
+  HLSLBool_t operator-(const HLSLBool_t &Other) const {
+    return HLSLBool_t(Val - Other.Val);
+  }
+
+  HLSLBool_t operator/(const HLSLBool_t &Other) const {
+    return HLSLBool_t(Val / Other.Val);
+  }
+
+  HLSLBool_t operator%(const HLSLBool_t &Other) const {
+    return HLSLBool_t(Val % Other.Val);
+  }
+
+  // So we can construct std::wstrings using std::wostream
+  friend std::wostream &operator<<(std::wostream &Os, const HLSLBool_t &Obj) {
+    Os << static_cast<bool>(Obj.Val);
+    return Os;
+  }
+
+  // So we can construct std::strings using std::ostream
+  friend std::ostream &operator<<(std::ostream &Os, const HLSLBool_t &Obj) {
+    Os << static_cast<bool>(Obj.Val);
+    return Os;
+  }
+
+  int32_t Val = 0;
+};
+
+//  No native float16 type in C++ until C++23. So we use uint16_t to represent
+//  it. Simple little wrapping struct to help handle the right behavior.
+struct HLSLHalf_t {
+  HLSLHalf_t() : Val(0) {}
+  HLSLHalf_t(DirectX::PackedVector::HALF Val) : Val(Val) {}
+  HLSLHalf_t(const HLSLHalf_t &Other) : Val(Other.Val) {}
+  HLSLHalf_t(const float F) {
+    Val = DirectX::PackedVector::XMConvertFloatToHalf(F);
+  }
+  HLSLHalf_t(const double D) {
+    float F = 0.0f;
+    // We wrap '::max' in () to prevent it from being expanded as a
+    // macro by the Windows SDK.
+    if (D >= (std::numeric_limits<double>::max)())
+      F = (std::numeric_limits<float>::max)();
+    else if (D <= std::numeric_limits<double>::lowest())
+      F = std::numeric_limits<float>::lowest();
+    else
+      F = static_cast<float>(D);
+
+    Val = DirectX::PackedVector::XMConvertFloatToHalf(F);
+  }
+  HLSLHalf_t(const int I) {
+    VERIFY_IS_TRUE(I == 0, L"HLSLHalf_t constructor with int override only "
+                           L"meant for cases when initializing to 0.");
+    const float F = static_cast<float>(I);
+    Val = DirectX::PackedVector::XMConvertFloatToHalf(F);
+  }
+
+  // Implicit conversion to float for use with things like std::acos, std::tan,
+  // etc
+  operator float() const {
+    return DirectX::PackedVector::XMConvertHalfToFloat(Val);
+  }
+
+  bool operator==(const HLSLHalf_t &Other) const {
+    // Convert to floats to properly handle the '0 == -0' case which must
+    // compare to true but have different uint16_t values.
+    // That is, 0 == -0 is true. We store Val as a uint16_t.
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    return A == B;
+  }
+
+  bool operator<(const HLSLHalf_t &Other) const {
+    return DirectX::PackedVector::XMConvertHalfToFloat(Val) <
+           DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+  }
+
+  bool operator>(const HLSLHalf_t &Other) const {
+    return DirectX::PackedVector::XMConvertHalfToFloat(Val) >
+           DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+  }
+
+  // Used by tolerance checks in the tests.
+  bool operator>(float F) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    return A > F;
+  }
+
+  bool operator<(float F) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    return A < F;
+  }
+
+  bool operator<=(const HLSLHalf_t &Other) const {
+    return DirectX::PackedVector::XMConvertHalfToFloat(Val) <=
+           DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+  }
+
+  bool operator>=(const HLSLHalf_t &Other) const {
+    return DirectX::PackedVector::XMConvertHalfToFloat(Val) >=
+           DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+  }
+
+  bool operator!=(const HLSLHalf_t &Other) const { return !(*this == Other); }
+
+  HLSLHalf_t operator*(const HLSLHalf_t &Other) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A * B));
+  }
+
+  HLSLHalf_t operator+(const HLSLHalf_t &Other) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A + B));
+  }
+
+  HLSLHalf_t operator-(const HLSLHalf_t &Other) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A - B));
+  }
+
+  HLSLHalf_t operator/(const HLSLHalf_t &Other) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A / B));
+  }
+
+  HLSLHalf_t operator%(const HLSLHalf_t &Other) const {
+    const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val);
+    const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val);
+    const float C = std::fmod(A, B);
+    return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(C));
+  }
+
+  // So we can construct std::wstrings using std::wostream
+  friend std::wostream &operator<<(std::wostream &Os, const HLSLHalf_t &Obj) {
+    Os << DirectX::PackedVector::XMConvertHalfToFloat(Obj.Val);
+    return Os;
+  }
+
+  // So we can construct std::strings using std::ostream
+  friend std::ostream &operator<<(std::ostream &Os, const HLSLHalf_t &Obj) {
+    Os << DirectX::PackedVector::XMConvertHalfToFloat(Obj.Val);
+    return Os;
+  }
+
+  // HALF is an alias to uint16_t
+  DirectX::PackedVector::HALF Val = 0;
+};
+
 template <typename T> struct LongVectorTestData {
   static const std::map<std::wstring, std::vector<T>> Data;
 };
 
+template <> struct LongVectorTestData<HLSLBool_t> {
+  inline static const std::map<std::wstring, std::vector<HLSLBool_t>> Data = {
+      {L"DefaultInputValueSet1",
+       {false, true, false, false, false, false, true, true, true, true}},
+      {L"DefaultInputValueSet2",
+       {true, false, false, false, false, true, true, true, false, false}},
+  };
+};
+
 template <> struct LongVectorTestData<int16_t> {
   inline static const std::map<std::wstring, std::vector<int16_t>> Data = {
       {L"DefaultInputValueSet1", {-6, 1, 7, 3, 8, 4, -3, 8, 8, -2}},
@@ -53,12 +247,36 @@ template <> struct LongVectorTestData<uint64_t> {
   };
 };
 
+template <> struct LongVectorTestData<HLSLHalf_t> {
+  inline static const std::map<std::wstring, std::vector<HLSLHalf_t>> Data = {
+      {L"DefaultInputValueSet1",
+       {-1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01}},
+      {L"DefaultInputValueSet2",
+       {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+      {L"DefaultClampArgs", {-1.0, 1.0}}, // Min, Max values for clamp
+      // Range [ -pi/2, pi/2]
+      {L"TrigonometricInputValueSet_RangeHalfPi",
+       {-1.073, 0.044, -1.047, 0.313, 1.447, -0.865, 1.364, -0.715, -0.800,
+        0.541}},
+      {L"TrigonometricInputValueSet_RangeOne",
+       {0.331, 0.727, -0.957, 0.677, -0.025, 0.495, 0.855, -0.673, -0.678,
+        -0.905}},
+  };
+};
+
 template <> struct LongVectorTestData<float> {
   inline static const std::map<std::wstring, std::vector<float>> Data = {
       {L"DefaultInputValueSet1",
        {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
       {L"DefaultInputValueSet2",
        {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
+      // Range [ -pi/2, pi/2]
+      {L"TrigonometricInputValueSet_RangeHalfPi",
+       {0.315f, -0.316f, 1.409f, -0.09f, -1.569f, 1.302f, -0.326f, 0.781f,
+        -1.235f, 0.623f}},
+      {L"TrigonometricInputValueSet_RangeOne",
+       {0.727f, 0.331f, -0.957f, 0.677f, -0.025f, 0.495f, 0.855f, -0.673f,
+        -0.678f, -0.905f}},
   };
 };
 
@@ -68,7 +286,13 @@ template <> struct LongVectorTestData<double> {
        {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
       {L"DefaultInputValueSet2",
        {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}},
-  };
+      // Range [ -pi/2, pi/2]
+      {L"TrigonometricInputValueSet_RangeHalfPi",
+       {0.807, 0.605, 1.317, 0.188, 1.566, -1.507, 0.67, -1.553, 0.194,
+        -0.883}},
+      {L"TrigonometricInputValueSet_RangeOne",
+       {0.331, 0.277, -0.957, 0.677, -0.025, 0.495, 0.855, -0.673, -0.678,
+        -0.905}}};
 };
 
 #endif // LONGVECTORTESTDATA_H
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp
index 54e5224798..9c2d3d229c 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.cpp
+++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp
@@ -110,7 +110,9 @@ void LongVector::OpTest::dispatchTestByDataType(
     TableParameterHandler &Handler) {
   using namespace WEX::Common;
 
-  if (DataType == L"int16")
+  if (DataType == L"bool")
+    dispatchTestByVectorSize<HLSLBool_t>(OpType, Handler);
+  else if (DataType == L"int16")
     dispatchTestByVectorSize<int16_t>(OpType, Handler);
   else if (DataType == L"int32")
     dispatchTestByVectorSize<int32_t>(OpType, Handler);
@@ -122,6 +124,8 @@ void LongVector::OpTest::dispatchTestByDataType(
     dispatchTestByVectorSize<uint32_t>(OpType, Handler);
   else if (DataType == L"uint64")
     dispatchTestByVectorSize<uint64_t>(OpType, Handler);
+  else if (DataType == L"float16")
+    dispatchTestByVectorSize<HLSLHalf_t>(OpType, Handler);
   else if (DataType == L"float32")
     dispatchTestByVectorSize<float>(OpType, Handler);
   else if (DataType == L"float64")
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.h b/tools/clang/unittests/HLSLExec/LongVectors.h
index 392d059bcd..9157da679d 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.h
+++ b/tools/clang/unittests/HLSLExec/LongVectors.h
@@ -68,7 +68,9 @@ void fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer,
                                         size_t NumElements);
 
 template <typename DataTypeT> constexpr bool isFloatingPointType() {
-  return std::is_same_v<DataTypeT, float> || std::is_same_v<DataTypeT, double>;
+  return std::is_same_v<DataTypeT, float> ||
+         std::is_same_v<DataTypeT, double> ||
+         std::is_same_v<DataTypeT, HLSLHalf_t>;
 }
 
 struct LongVectorOpTypeStringToEnumValue {
@@ -169,6 +171,9 @@ template <typename LongVectorOpTypeT> struct TestConfigTraits {
 
 template <typename DataTypeT>
 bool doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance, ValidationType);
+bool doValuesMatch(HLSLBool_t A, HLSLBool_t B, float, ValidationType);
+bool doValuesMatch(HLSLHalf_t A, HLSLHalf_t B, float Tolerance,
+                   ValidationType ValidationType);
 bool doValuesMatch(float A, float B, float Tolerance,
                    ValidationType ValidationType);
 bool doValuesMatch(double A, double B, float Tolerance,
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.tpp b/tools/clang/unittests/HLSLExec/LongVectors.tpp
index de333cf863..331d4452eb 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.tpp
+++ b/tools/clang/unittests/HLSLExec/LongVectors.tpp
@@ -19,17 +19,32 @@ DataTypeT LongVector::getLongVectorOpType(const LongVectorOpTypeStringToEnumValu
 template <typename DataTypeT>
 void LongVector::fillShaderBufferFromLongVectorData(std::vector<BYTE> &ShaderBuffer, std::vector<DataTypeT> &TestData) {
 
+  // Note: DataSize for HLSLHalf_t and HLSLBool_t may be larger than the
+  // underlying type in some cases. That's fine. Resize just makes sure we have
+  // enough space.
   const size_t NumElements = TestData.size();
   const size_t DataSize = sizeof(DataTypeT) * NumElements;
   ShaderBuffer.resize(DataSize);
 
-  DataTypeT *ShaderBufferPtr =
-    reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
-  for (size_t i = 0; i < NumElements; ++i)
-    ShaderBufferPtr[i] = TestData[i];
+  if constexpr (std::is_same_v<DataTypeT, HLSLHalf_t>) {
+    DirectX::PackedVector::HALF *ShaderBufferPtr =
+        reinterpret_cast<DirectX::PackedVector::HALF *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      ShaderBufferPtr[i] = TestData[i].Val;
+  } else if constexpr (std::is_same_v<DataTypeT, HLSLBool_t>) {
+    int32_t *ShaderBufferPtr = reinterpret_cast<int32_t *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      ShaderBufferPtr[i] = TestData[i].Val;
+  } else {
+    DataTypeT *ShaderBufferPtr =
+        reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      ShaderBufferPtr[i] = TestData[i];
+  }
 }
 
-// Helpers so we do the right thing for float types.
+// Helpers so we do the right thing for float types. HLSLHalf_t is handled in an
+// operator overload.
 template <typename DataTypeT>
 DataTypeT LongVector::mod(const DataTypeT &A, const DataTypeT &B) {
   return A % B;
@@ -49,10 +64,23 @@ template <typename DataTypeT>
 void LongVector::fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer,
                                         std::vector<DataTypeT> &TestData,
                                         size_t NumElements) {
-  DataTypeT *ShaderBufferPtr =
-    reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
-  for (size_t i = 0; i < NumElements; ++i)
-    TestData.push_back(ShaderBufferPtr[i]);
+  if constexpr (std::is_same_v<DataTypeT, HLSLHalf_t>) {
+    DirectX::PackedVector::HALF *ShaderBufferPtr =
+        reinterpret_cast<DirectX::PackedVector::HALF *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      // HLSLHalf_t has a DirectX::PackedVector::HALF based constructor.
+      TestData.push_back(ShaderBufferPtr[i]);
+  } else if constexpr (std::is_same_v<DataTypeT, HLSLBool_t>) {
+    int32_t *ShaderBufferPtr = reinterpret_cast<int32_t *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      // HLSLBool_t has a int32_t based constructor.
+      TestData.push_back(ShaderBufferPtr[i]);
+  } else {
+    DataTypeT *ShaderBufferPtr =
+        reinterpret_cast<DataTypeT *>(ShaderBuffer.data());
+    for (size_t i = 0; i < NumElements; ++i)
+      TestData.push_back(ShaderBufferPtr[i]);
+  }
 }
 
 template <typename DataTypeT>
@@ -65,6 +93,25 @@ bool LongVector::doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance,
   return Diff <= Tolerance;
 }
 
+bool LongVector::doValuesMatch(HLSLBool_t A, HLSLBool_t B, float,
+                          LongVector::ValidationType) {
+  return A == B;
+}
+
+bool LongVector::doValuesMatch(HLSLHalf_t A, HLSLHalf_t B, float Tolerance,
+                          LongVector::ValidationType ValidationType) {
+  switch (ValidationType) {
+  case LongVector::ValidationType_Epsilon:
+    return CompareHalfEpsilon(A.Val, B.Val, Tolerance);
+  case LongVector::ValidationType_Ulp:
+    return CompareHalfULP(A.Val, B.Val, Tolerance);
+  default:
+    WEX::Logging::Log::Error(
+        L"Invalid ValidationType. Expecting Epsilon or ULP.");
+    return false;
+  }
+}
+
 bool LongVector::doValuesMatch(float A, float B, float Tolerance,
                           LongVector::ValidationType ValidationType) {
   switch (ValidationType) {
@@ -322,6 +369,10 @@ std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getOPERAND2Str
 
 template <typename DataTypeT, typename LongVectorOpTypeT>
 std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getHLSLTypeString() const {
+  if (std::is_same_v<DataTypeT, HLSLBool_t>)
+    return "bool";
+  if (std::is_same_v<DataTypeT, HLSLHalf_t>)
+    return "half";
   if (std::is_same_v<DataTypeT, float>)
     return "float";
   if (std::is_same_v<DataTypeT, double>)
@@ -414,6 +465,8 @@ DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedV
 
   if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::UnaryOpType>) {
     const auto OpType = static_cast<LongVector::UnaryOpType>(OpTypeTraits.OpType);
+    // HLSLHalf_t is a struct. We need to call the constructor to get the
+    // expected value.
     return computeExpectedValue(A, OpType);
   }
 
@@ -433,7 +486,7 @@ std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getCompilerOpt
   CompilerOptions << " -DNUM=";
   CompilerOptions << VectorSize;
   const bool Is16BitType =
-    (HLSLType == "int16_t" || HLSLType == "uint16_t" || HLSLType == "half");
+      (HLSLType == "int16_t" || HLSLType == "uint16_t" || HLSLType == "half");
   CompilerOptions << (Is16BitType ? " -enable-16bit-types" : "");
   CompilerOptions << " -DOPERATOR=";
   CompilerOptions << OperatorString;

From 68dedee546982f23a47766c20d37d587befb5ed0 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 11 Jul 2025 10:25:05 -0400
Subject: [PATCH 88/93] [SPIRV] Add option to use the Unknown image format
 (#7632)

Many Vulkan drivers allow the SPIR-V to use the `Unknown` image format
for storage images and texel buffers. This makes the SPIR-V more
flexible, and is useful for developers. However, it can be cumbersome to
have to add the `vk::image_format` attribute to all resources.

This option allows users to make `Unknown` the default image format for
all resource types.

Fixes #7484
---
 docs/SPIR-V.rst                               |   7 +
 include/dxc/Support/HLSLOptions.td            |   6 +
 include/dxc/Support/SPIRVOptions.h            |   1 +
 lib/DxcSupport/HLSLOptions.cpp                |   2 +
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp    |   4 +
 .../clang/test/CodeGenSPIRV/type.buffer.hlsl  | 171 +++++++++++-------
 .../type.rasterizer-ordered-buffer.hlsl       |  91 ++++++----
 .../type.rasterizer-ordered-texture.hlsl      |  35 ++--
 .../test/CodeGenSPIRV/type.rwtexture.hlsl     |  56 ++++--
 9 files changed, 237 insertions(+), 136 deletions(-)

diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
index 2bcdb99bfe..a695e5854d 100644
--- a/docs/SPIR-V.rst
+++ b/docs/SPIR-V.rst
@@ -896,6 +896,13 @@ are translated into SPIR-V ``OpTypeImage``, with parameters:
 The meanings of the headers in the above table is explained in ``OpTypeImage``
 of the SPIR-V spec.
 
+For storage images (e.g. ``RWTexture2D<T>``) and texel buffers (e.g. ``RWBuffer<T>``),
+the image format is typically inferred from the data type ``T``. However, the
+``-fspv-use-unknown-image-format`` command-line option can be used to change
+this behavior. When this option is active, the default format for these
+resources becomes ``Unknown`` if not otherwise specified by the
+``[[vk::image_format]]`` attribute.
+
 Vulkan specific Image Formats
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
index 58f6bdfbf3..4a38e275c3 100644
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@@ -403,6 +403,12 @@ def fspv_enable_maximal_reconvergence: Flag<["-"], "fspv-enable-maximal-reconver
   HelpText<"Enables the MaximallyReconvergesKHR execution mode for this module.">;
 def fspv_use_vulkan_memory_model: Flag<["-"], "fspv-use-vulkan-memory-model">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
   HelpText<"Generates SPIR-V modules that use the Vulkan memory model instead of GLSL450.">;
+def fspv_use_unknown_image_format
+    : Flag<["-"], "fspv-use-unknown-image-format">,
+      Group<spirv_Group>,
+      Flags<[CoreOption, DriverOption]>,
+      HelpText<"For storage images and texel buffers, sets the default format to 'Unknown' when not specified via the `vk::image_format` attribute. If this option is not used, the format is inferred from the resource's data type.">;
+
 def fvk_auto_shift_bindings: Flag<["-"], "fvk-auto-shift-bindings">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
   HelpText<"Apply fvk-*-shift to resources without an explicit register assignment.">;
 def Wno_vk_ignored_features : Joined<["-"], "Wno-vk-ignored-features">, Group<spirv_Group>, Flags<[CoreOption, DriverOption, HelpHidden]>,
diff --git a/include/dxc/Support/SPIRVOptions.h b/include/dxc/Support/SPIRVOptions.h
index 1b88ef4def..352cf6c2ec 100644
--- a/include/dxc/Support/SPIRVOptions.h
+++ b/include/dxc/Support/SPIRVOptions.h
@@ -71,6 +71,7 @@ struct SpirvCodeGenOptions {
   bool fixFuncCallArguments;
   bool enableMaximalReconvergence;
   bool useVulkanMemoryModel;
+  bool useUnknownImageFormat;
   bool IEEEStrict;
   /// Maximum length in words for the OpString literal containing the shader
   /// source for DebugSource and DebugSourceContinued. If the source code length
diff --git a/lib/DxcSupport/HLSLOptions.cpp b/lib/DxcSupport/HLSLOptions.cpp
index eb071eb0a6..b3eb422eb9 100644
--- a/lib/DxcSupport/HLSLOptions.cpp
+++ b/lib/DxcSupport/HLSLOptions.cpp
@@ -1120,6 +1120,8 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
       Args.hasFlag(OPT_fspv_enable_maximal_reconvergence, OPT_INVALID, false);
   opts.SpirvOptions.useVulkanMemoryModel =
       Args.hasFlag(OPT_fspv_use_vulkan_memory_model, OPT_INVALID, false);
+  opts.SpirvOptions.useUnknownImageFormat =
+      Args.hasFlag(OPT_fspv_use_unknown_image_format, OPT_INVALID, false);
 
   if (!handleVkShiftArgs(Args, OPT_fvk_b_shift, "b", &opts.SpirvOptions.bShift,
                          errors) ||
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 1869983ae3..9d1f1fff60 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -1156,6 +1156,10 @@ LowerTypeVisitor::lowerStructFields(const RecordDecl *decl,
 spv::ImageFormat
 LowerTypeVisitor::translateSampledTypeToImageFormat(QualType sampledType,
                                                     SourceLocation srcLoc) {
+
+  if (spvOptions.useUnknownImageFormat)
+    return spv::ImageFormat::Unknown;
+
   uint32_t elemCount = 1;
   QualType ty = {};
   if (!isScalarType(sampledType, &ty) &&
diff --git a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
index 35d1b868a8..769fe808b2 100644
--- a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl
@@ -1,109 +1,144 @@
-// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
 
 // CHECK: OpCapability SampledBuffer
-// CHECK: OpCapability StorageImageExtendedFormats
+// INFER: OpCapability StorageImageExtendedFormats
 
-// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
+// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i
+// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 Buffer<int> intbuf;
-// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
+// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui
+// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 Buffer<uint> uintbuf;
-// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
+// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f
+// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 Buffer<float> floatbuf;
 
-// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
+// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i
+// UNKNOWN: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RWBuffer<int> intrwbuf;
-// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// UNKNOWN: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RWBuffer<uint> uintrwbuf;
-// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// UNKNOWN: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RWBuffer<float> floatrwbuf;
 
-// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// If the `Unknown` image format is used, then the images below will reuse the types above.
+// UNKNOWN-NOT: OpTypeImage
+
+// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
 Buffer<int2> int2buf;
-// CHECK: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// INFER: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 Buffer<uint2> uint2buf;
-// CHECK: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// INFER: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
 Buffer<float2> float2buf;
 
-// CHECK: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// INFER: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RWBuffer<int2> int2rwbuf;
-// CHECK: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// INFER: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
 RWBuffer<uint2> uint2rwbuf;
-// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RWBuffer<float2> float2rwbuf;
 
-// CHECK: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
-// CHECK: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
+// INFER: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11
+// INFER: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12
 Buffer<int3> int3buf;
 Buffer<int4> int4buf;
-// CHECK: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
-// CHECK: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
+// INFER: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13
+// INFER: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14
 Buffer<uint3> uint3buf;
 Buffer<uint4> uint4buf;
-// CHECK: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
-// CHECK: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
+// INFER: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15
+// INFER: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16
 Buffer<float3> float3buf;
 Buffer<float4> float4buf;
 
-// CHECK: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
-// CHECK: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
+// INFER: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17
+// INFER: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18
 RWBuffer<int3> int3rwbuf;
 RWBuffer<int4> int4rwbuf;
-// CHECK: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
-// CHECK: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
+// INFER: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19
+// INFER: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20
 RWBuffer<uint3> uint3rwbuf;
 RWBuffer<uint4> uint4rwbuf;
-// CHECK: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
-// CHECK: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
+// INFER: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21
+// INFER: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22
 RWBuffer<float3> float3rwbuf;
 RWBuffer<float4> float4rwbuf;
 
-// CHECK: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// CHECK: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// CHECK: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// CHECK: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// CHECK: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// CHECK: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// CHECK: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// CHECK: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// CHECK: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// CHECK: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// CHECK: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// CHECK: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
-// CHECK: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
-// CHECK: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
-// CHECK: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
-// CHECK: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
-// CHECK: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
-// CHECK: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
-// CHECK: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
-// CHECK: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
-// CHECK: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
-// CHECK: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
-// CHECK: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
-// CHECK: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
+// INFER: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// INFER: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// INFER: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// INFER: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// INFER: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// INFER: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// INFER: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// INFER: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// INFER: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// INFER: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// INFER: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// INFER: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+// INFER: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant
+// INFER: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant
+// INFER: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant
+// INFER: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant
+// INFER: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant
+// INFER: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant
+// INFER: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant
+// INFER: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant
+// INFER: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant
+// INFER: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant
+// INFER: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant
+// INFER: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant
+
+// UNKNOWN: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// UNKNOWN: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// UNKNOWN: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// UNKNOWN: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
 
 void main() {}
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
index c616f65bb9..cf84562e52 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl
@@ -1,59 +1,80 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
+
+// Before Vulkan 1.3, we should be trying to infer the image format because
+// we cannot necessarily use Unknown. However, in Vulkan 1.3 and later, we can
+// use Unknown.
 
 // CHECK: OpCapability SampledBuffer
-// CHECK: OpCapability StorageImageExtendedFormats
+// INFER: OpCapability StorageImageExtendedFormats
 
-// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
+// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i
+// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image
 RasterizerOrderedBuffer<int> introvbuf;
-// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui
+// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0
 RasterizerOrderedBuffer<uint> uintrovbuf;
-// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f
+// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1
 RasterizerOrderedBuffer<float> floatrovbuf;
 
-// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
+// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2
 RasterizerOrderedBuffer<int2> int2rovbuf;
-// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
+// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3
 RasterizerOrderedBuffer<uint2> uint2rovbuf;
-// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
+// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4
 RasterizerOrderedBuffer<float2> float2rovbuf;
 
-// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
-// CHECK: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
-// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
+// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5
+// INFER: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i
+// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6
 RasterizerOrderedBuffer<int3> int3rovbuf;
 RasterizerOrderedBuffer<int4> int4rovbuf;
-// CHECK: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
-// CHECK: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
-// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
+// INFER: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7
+// INFER: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui
+// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8
 RasterizerOrderedBuffer<uint3> uint3rovbuf;
 RasterizerOrderedBuffer<uint4> uint4rovbuf;
-// CHECK: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
-// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
-// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
+// INFER: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown
+// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9
+// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10
 RasterizerOrderedBuffer<float3> float3rovbuf;
 RasterizerOrderedBuffer<float4> float4rovbuf;
 
-// CHECK: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
-// CHECK: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
-// CHECK: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
-// CHECK: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
-// CHECK: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
-// CHECK: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
-// CHECK: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
-// CHECK: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
-// CHECK: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
-// CHECK: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
-// CHECK: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
-// CHECK: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+// INFER: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// INFER: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// INFER: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// INFER: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant
+// INFER: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant
+// INFER: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant
+// INFER: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant
+// INFER: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant
+// INFER: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant
+// INFER: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant
+// INFER: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant
+// INFER: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant
+
+// UNKNOWN: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant
+// UNKNOWN: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant
+// UNKNOWN: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
+// UNKNOWN: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant
 
 void main() {}
 
diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
index 32dd76e6f1..651840b0e6 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl
@@ -1,23 +1,27 @@
-// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s
+// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
 
 // CHECK: OpCapability Image1D
 
-// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
 // CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
@@ -33,7 +37,8 @@ RasterizerOrderedTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RasterizerOrderedTexture3D   <float3> t4 ;
 
-// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
 RasterizerOrderedTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant
diff --git a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
index f901d44cfa..44e7592869 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl
@@ -1,24 +1,43 @@
-// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s
+// RUN: %dxc -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER
+// RUN: %dxc -fspv-use-unknown-image-format -T vs_6_0 -E main -fcgl  %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN
 
 // CHECK: OpCapability Image1D
 
-// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
-// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
-// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
-// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
-// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
-// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
-// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
-// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
-// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
-// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
-// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
+// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
+// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
+// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i
+// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui
+// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f
+// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
+// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image
+// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image
+// UNKNOWN: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0
+// UNKNOWN: %type_3d_image_1 = OpTypeImage %float 3D 2 0 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_3d_image_1 = OpTypePointer UniformConstant %type_3d_image_1
+// UNKNOWN: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array
+// UNKNOWN: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array
+// UNKNOWN: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0
+// UNKNOWN: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Unknown
+// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0
 
 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant
 RWTexture1D   <int>    t1 ;
@@ -33,7 +52,8 @@ RWTexture3D   <int>    t3 ;
 [[vk::image_format("rgba32f")]]
 RWTexture3D   <float3> t4 ;
 
-// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant
+// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant
 RWTexture3D   <float4> t5 ;
 
 // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant

From 162bf4ec397e9074031052e4d00d5e7973deec42 Mon Sep 17 00:00:00 2001
From: Chris B <cbieneman@microsoft.com>
Date: Thu, 17 Jul 2025 17:45:56 -0500
Subject: [PATCH 89/93] Fix intrinsic lookup with namespaces (#7599)

This change fixes issues with intrinsic lookup caused by not correctly
respecting the using directive(s) that impact unqualified lookups.

This probably isn't a perfect solution because I'm sure there's some
nuance of unqualified lookups in C++ that I'm not handling, but this
does respect scoped using directives and allows us to get things
working.

Additionally this change disables emitting some "declared here" notes
when the source location referred to is invalid.

Fixes #7495
---
 .../include/clang/Sema/ExternalSemaSource.h   |   7 +-
 tools/clang/include/clang/Sema/Sema.h         |   5 +
 tools/clang/lib/Sema/SemaCodeComplete.cpp     |   2 +-
 tools/clang/lib/Sema/SemaHLSL.cpp             | 170 ++++++++++--------
 tools/clang/lib/Sema/SemaLookup.cpp           |  36 +++-
 tools/clang/lib/Sema/SemaOverload.cpp         |   7 +-
 tools/clang/test/SemaHLSL/effects-syntax.hlsl |   2 -
 tools/clang/test/SemaHLSL/raytracings.hlsl    |   4 +-
 .../SemaHLSL/using-namespace-dx-errors.hlsl   |  42 +++++
 .../test/SemaHLSL/using-namespace-dx.hlsl     |  56 ++++++
 10 files changed, 247 insertions(+), 84 deletions(-)
 create mode 100644 tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/using-namespace-dx.hlsl

diff --git a/tools/clang/include/clang/Sema/ExternalSemaSource.h b/tools/clang/include/clang/Sema/ExternalSemaSource.h
index 91578e2440..b10d649cc6 100644
--- a/tools/clang/include/clang/Sema/ExternalSemaSource.h
+++ b/tools/clang/include/clang/Sema/ExternalSemaSource.h
@@ -211,10 +211,9 @@ class ExternalSemaSource : public ExternalASTSource {
   // add call candidates to the given expression. It returns 'true'
   // if standard overload search should be suppressed; false otherwise.
   virtual bool AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE,
-    ArrayRef<Expr *> Args,
-    OverloadCandidateSet &CandidateSet,
-    bool PartialOverloading)
-  {
+                                           ArrayRef<Expr *> Args,
+                                           OverloadCandidateSet &CandidateSet,
+                                           Scope *S, bool PartialOverloading) {
     return false;
   }
 
diff --git a/tools/clang/include/clang/Sema/Sema.h b/tools/clang/include/clang/Sema/Sema.h
index 755c7e0755..5e20f6f0f8 100644
--- a/tools/clang/include/clang/Sema/Sema.h
+++ b/tools/clang/include/clang/Sema/Sema.h
@@ -2495,9 +2495,14 @@ class Sema {
                                             DeclAccessPair FoundDecl,
                                             FunctionDecl *Fn);
 
+  // HLSL Change Begin
+  void CollectNamespaceContexts(Scope *,
+                                SmallVectorImpl<const DeclContext *> &);
+  // HLSL Change End
   void AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE,
                                    ArrayRef<Expr *> Args,
                                    OverloadCandidateSet &CandidateSet,
+                                   Scope *S, // HLSL Change
                                    bool PartialOverloading = false);
 
   // An enum used to represent the different possible results of building a
diff --git a/tools/clang/lib/Sema/SemaCodeComplete.cpp b/tools/clang/lib/Sema/SemaCodeComplete.cpp
index b1b4668ba3..84d0990346 100644
--- a/tools/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/tools/clang/lib/Sema/SemaCodeComplete.cpp
@@ -4020,7 +4020,7 @@ void Sema::CodeCompleteCall(Scope *S, Expr *Fn, ArrayRef<Expr *> Args) {
 
   Expr *NakedFn = Fn->IgnoreParenCasts();
   if (auto ULE = dyn_cast<UnresolvedLookupExpr>(NakedFn))
-    AddOverloadedCallCandidates(ULE, Args, CandidateSet,
+    AddOverloadedCallCandidates(ULE, Args, CandidateSet, S, // HLSL Change
                                 /*PartialOverloading=*/true);
   else if (auto UME = dyn_cast<UnresolvedMemberExpr>(NakedFn)) {
     TemplateArgumentListInfo TemplateArgsBuffer, *TemplateArgs = nullptr;
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 8e800e8f68..dcb6142858 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -4152,6 +4152,7 @@ class HLSLExternalSource : public ExternalSemaSource {
                               SourceLocation(), &context.Idents.get("dx"),
                               /*PrevDecl*/ nullptr);
     m_dxNSDecl->setImplicit();
+    m_dxNSDecl->setHasExternalLexicalStorage(true);
     context.getTranslationUnitDecl()->addDecl(m_dxNSDecl);
 
 #ifdef ENABLE_SPIRV_CODEGEN
@@ -5169,7 +5170,7 @@ class HLSLExternalSource : public ExternalSemaSource {
 
   bool AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE,
                                    ArrayRef<Expr *> Args,
-                                   OverloadCandidateSet &CandidateSet,
+                                   OverloadCandidateSet &CandidateSet, Scope *S,
                                    bool PartialOverloading) override {
     DXASSERT_NOMSG(ULE != nullptr);
 
@@ -5194,6 +5195,8 @@ class HLSLExternalSource : public ExternalSemaSource {
     // Exceptions:
     // - Vulkan-specific intrinsics live in the 'vk::' namespace.
     // - DirectX-specific intrinsics live in the 'dx::' namespace.
+    // - Global namespaces could just mean we have a `using` declaration... so
+    // it can be anywhere!
     if (isQualified && !isGlobalNamespace && !isVkNamespace && !isDxNamespace)
       return false;
 
@@ -5204,81 +5207,106 @@ class HLSLExternalSource : public ExternalSemaSource {
     }
 
     StringRef nameIdentifier = idInfo->getName();
-    const HLSL_INTRINSIC *table = g_Intrinsics;
-    auto tableCount = _countof(g_Intrinsics);
-    if (isDxNamespace) {
-      table = g_DxIntrinsics;
-      tableCount = _countof(g_DxIntrinsics);
+    using IntrinsicArray = llvm::ArrayRef<const HLSL_INTRINSIC>;
+    struct IntrinsicTableEntry {
+      IntrinsicArray Table;
+      NamespaceDecl *NS;
+    };
+
+    llvm::SmallVector<IntrinsicTableEntry, 3> SearchTables;
+
+    bool SearchDX = isDxNamespace;
+    bool SearchVK = isVkNamespace;
+    if (isGlobalNamespace || !isQualified)
+      SearchTables.push_back(
+          IntrinsicTableEntry{IntrinsicArray(g_Intrinsics), m_hlslNSDecl});
+
+    if (S && !isQualified) {
+      SmallVector<const DeclContext *, 4> NSContexts;
+      m_sema->CollectNamespaceContexts(S, NSContexts);
+      for (const auto &UD : NSContexts) {
+        if (static_cast<DeclContext *>(m_dxNSDecl) == UD)
+          SearchDX = true;
+        else if (static_cast<DeclContext *>(m_vkNSDecl) == UD)
+          SearchVK = true;
+      }
     }
+
+    if (SearchDX)
+      SearchTables.push_back(
+          IntrinsicTableEntry{IntrinsicArray(g_DxIntrinsics), m_dxNSDecl});
 #ifdef ENABLE_SPIRV_CODEGEN
-    if (isVkNamespace) {
-      table = g_VkIntrinsics;
-      tableCount = _countof(g_VkIntrinsics);
-    }
-#endif // ENABLE_SPIRV_CODEGEN
+    if (SearchVK)
+      SearchTables.push_back(
+          IntrinsicTableEntry{IntrinsicArray(g_VkIntrinsics), m_vkNSDecl});
+#endif
 
-    IntrinsicDefIter cursor = FindIntrinsicByNameAndArgCount(
-        table, tableCount, StringRef(), nameIdentifier, Args.size());
-    IntrinsicDefIter end = IntrinsicDefIter::CreateEnd(
-        table, tableCount, IntrinsicTableDefIter::CreateEnd(m_intrinsicTables));
-
-    for (; cursor != end; ++cursor) {
-      // If this is the intrinsic we're interested in, build up a representation
-      // of the types we need.
-      const HLSL_INTRINSIC *pIntrinsic = *cursor;
-      LPCSTR tableName = cursor.GetTableName();
-      LPCSTR lowering = cursor.GetLoweringStrategy();
-      DXASSERT(pIntrinsic->uNumArgs <= g_MaxIntrinsicParamCount + 1,
-               "otherwise g_MaxIntrinsicParamCount needs to be updated for "
-               "wider signatures");
-
-      std::vector<QualType> functionArgTypes;
-      size_t badArgIdx;
-      bool argsMatch =
-          MatchArguments(cursor, QualType(), QualType(), QualType(), Args,
-                         &functionArgTypes, badArgIdx);
-      if (!functionArgTypes.size())
-        return false;
+    assert(!SearchTables.empty() && "Must have at least one search table!");
+
+    for (const auto &T : SearchTables) {
+
+      IntrinsicDefIter cursor = FindIntrinsicByNameAndArgCount(
+          T.Table.data(), T.Table.size(), StringRef(), nameIdentifier,
+          Args.size());
+      IntrinsicDefIter end = IntrinsicDefIter::CreateEnd(
+          T.Table.data(), T.Table.size(),
+          IntrinsicTableDefIter::CreateEnd(m_intrinsicTables));
+
+      for (; cursor != end; ++cursor) {
+        // If this is the intrinsic we're interested in, build up a
+        // representation of the types we need.
+        const HLSL_INTRINSIC *pIntrinsic = *cursor;
+        LPCSTR tableName = cursor.GetTableName();
+        LPCSTR lowering = cursor.GetLoweringStrategy();
+        DXASSERT(pIntrinsic->uNumArgs <= g_MaxIntrinsicParamCount + 1,
+                 "otherwise g_MaxIntrinsicParamCount needs to be updated for "
+                 "wider signatures");
+
+        std::vector<QualType> functionArgTypes;
+        size_t badArgIdx;
+        bool argsMatch =
+            MatchArguments(cursor, QualType(), QualType(), QualType(), Args,
+                           &functionArgTypes, badArgIdx);
+        if (!functionArgTypes.size())
+          return false;
 
-      // Get or create the overload we're interested in.
-      FunctionDecl *intrinsicFuncDecl = nullptr;
-      std::pair<UsedIntrinsicStore::iterator, bool> insertResult =
-          m_usedIntrinsics.insert(UsedIntrinsic(pIntrinsic, functionArgTypes));
-      bool insertedNewValue = insertResult.second;
-      if (insertedNewValue) {
-        NamespaceDecl *nsDecl = m_hlslNSDecl;
-        if (isVkNamespace)
-          nsDecl = m_vkNSDecl;
-        else if (isDxNamespace)
-          nsDecl = m_dxNSDecl;
-        DXASSERT(tableName,
-                 "otherwise IDxcIntrinsicTable::GetTableName() failed");
-        intrinsicFuncDecl =
-            AddHLSLIntrinsicFunction(*m_context, nsDecl, tableName, lowering,
-                                     pIntrinsic, &functionArgTypes);
-        insertResult.first->setFunctionDecl(intrinsicFuncDecl);
-      } else {
-        intrinsicFuncDecl = (*insertResult.first).getFunctionDecl();
-      }
+        // Get or create the overload we're interested in.
+        FunctionDecl *intrinsicFuncDecl = nullptr;
+        std::pair<UsedIntrinsicStore::iterator, bool> insertResult =
+            m_usedIntrinsics.insert(
+                UsedIntrinsic(pIntrinsic, functionArgTypes));
+        bool insertedNewValue = insertResult.second;
+        if (insertedNewValue) {
+          DXASSERT(tableName,
+                   "otherwise IDxcIntrinsicTable::GetTableName() failed");
+          intrinsicFuncDecl =
+              AddHLSLIntrinsicFunction(*m_context, T.NS, tableName, lowering,
+                                       pIntrinsic, &functionArgTypes);
+          insertResult.first->setFunctionDecl(intrinsicFuncDecl);
+        } else {
+          intrinsicFuncDecl = (*insertResult.first).getFunctionDecl();
+        }
 
-      OverloadCandidate &candidate = CandidateSet.addCandidate(Args.size());
-      candidate.Function = intrinsicFuncDecl;
-      candidate.FoundDecl.setDecl(intrinsicFuncDecl);
-      candidate.Viable = argsMatch;
-      CandidateSet.isNewCandidate(intrinsicFuncDecl); // used to insert into set
-      if (argsMatch)
-        return true;
-      if (badArgIdx) {
-        candidate.FailureKind = ovl_fail_bad_conversion;
-        QualType ParamType =
-            intrinsicFuncDecl->getParamDecl(badArgIdx - 1)->getType();
-        candidate.Conversions[badArgIdx - 1].setBad(
-            BadConversionSequence::no_conversion, Args[badArgIdx - 1],
-            ParamType);
-      } else {
-        // A less informative error. Needed when the failure relates to the
-        // return type
-        candidate.FailureKind = ovl_fail_bad_final_conversion;
+        OverloadCandidate &candidate = CandidateSet.addCandidate(Args.size());
+        candidate.Function = intrinsicFuncDecl;
+        candidate.FoundDecl.setDecl(intrinsicFuncDecl);
+        candidate.Viable = argsMatch;
+        CandidateSet.isNewCandidate(
+            intrinsicFuncDecl); // used to insert into set
+        if (argsMatch)
+          return true;
+        if (badArgIdx) {
+          candidate.FailureKind = ovl_fail_bad_conversion;
+          QualType ParamType =
+              intrinsicFuncDecl->getParamDecl(badArgIdx - 1)->getType();
+          candidate.Conversions[badArgIdx - 1].setBad(
+              BadConversionSequence::no_conversion, Args[badArgIdx - 1],
+              ParamType);
+        } else {
+          // A less informative error. Needed when the failure relates to the
+          // return type
+          candidate.FailureKind = ovl_fail_bad_final_conversion;
+        }
       }
     }
 
diff --git a/tools/clang/lib/Sema/SemaLookup.cpp b/tools/clang/lib/Sema/SemaLookup.cpp
index 98832a8f57..eec8a7fa64 100644
--- a/tools/clang/lib/Sema/SemaLookup.cpp
+++ b/tools/clang/lib/Sema/SemaLookup.cpp
@@ -55,6 +55,7 @@
 using namespace clang;
 using namespace sema;
 
+// HLSL Note: This set of utilities copied to SemaHLSL.cpp.
 namespace {
   class UnqualUsingEntry {
     const DeclContext *Nominated;
@@ -4809,9 +4810,12 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction,
 
   NamedDecl *ChosenDecl =
       Correction.isKeyword() ? nullptr : Correction.getCorrectionDecl();
-  if (PrevNote.getDiagID() && ChosenDecl)
+  // HLSL Change begin: don't put notes on invalid source locations.
+  if (PrevNote.getDiagID() && ChosenDecl &&
+      !ChosenDecl->getLocation().isInvalid())
     Diag(ChosenDecl->getLocation(), PrevNote)
       << CorrectedQuotedStr << (ErrorRecovery ? FixItHint() : FixTypo);
+  // HLSL Change end
 }
 
 TypoExpr *Sema::createDelayedTypo(std::unique_ptr<TypoCorrectionConsumer> TCC,
@@ -4836,3 +4840,33 @@ const Sema::TypoExprState &Sema::getTypoExprState(TypoExpr *TE) const {
 void Sema::clearDelayedTypo(TypoExpr *TE) {
   DelayedTypos.erase(TE);
 }
+
+// HLSL Change Begin
+void Sema::CollectNamespaceContexts(Scope *S,
+                                    SmallVectorImpl<const DeclContext *> &NSs) {
+  UnqualUsingDirectiveSet UDirs;
+
+  // Add using directives from this context up to the top level. This
+  // handles cases where the current declaration is in a context that has
+  // a using directive but might be in a scope chain that doesn't reach
+  // the using directive (i.e. a using inside a namespace or class
+  // declaration but the function definition is outside).
+  DeclContext *Ctx = S->getEntity();
+  for (DeclContext *UCtx = Ctx; UCtx; UCtx = UCtx->getParent()) {
+    if (UCtx->isTransparentContext())
+      continue;
+
+    UDirs.visit(UCtx, UCtx);
+  }
+  // Find the first namespace or translation-unit scope.
+  Scope *Innermost = S;
+  while (Innermost && !isNamespaceOrTranslationUnitScope(Innermost))
+    Innermost = Innermost->getParent();
+
+  UDirs.visitScopeChain(S, Innermost);
+  UDirs.done();
+
+  for (auto &UD : UDirs)
+    NSs.push_back(UD.getNominatedNamespace());
+}
+// HLSL Change End
diff --git a/tools/clang/lib/Sema/SemaOverload.cpp b/tools/clang/lib/Sema/SemaOverload.cpp
index 1bcbc7442f..274b66646b 100644
--- a/tools/clang/lib/Sema/SemaOverload.cpp
+++ b/tools/clang/lib/Sema/SemaOverload.cpp
@@ -10627,6 +10627,7 @@ static void AddOverloadedCallCandidate(Sema &S,
 void Sema::AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE,
                                        ArrayRef<Expr *> Args,
                                        OverloadCandidateSet &CandidateSet,
+                                       Scope *S, // HLSL Change
                                        bool PartialOverloading) {
 
 #ifndef NDEBUG
@@ -10659,8 +10660,8 @@ void Sema::AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE,
 #endif
 
   // HLSL Change - allow ExternalSource the ability to add the overloads for a call.
-  if (ExternalSource &&
-    ExternalSource->AddOverloadedCallCandidates(ULE, Args, CandidateSet, PartialOverloading)) {
+  if (ExternalSource && ExternalSource->AddOverloadedCallCandidates(
+                            ULE, Args, CandidateSet, S, PartialOverloading)) {
     return;
   }
 
@@ -10970,7 +10971,7 @@ bool Sema::buildOverloadedCallSet(Scope *S, Expr *Fn,
 
   // Add the functions denoted by the callee to the set of candidate
   // functions, including those from argument-dependent lookup.
-  AddOverloadedCallCandidates(ULE, Args, *CandidateSet);
+  AddOverloadedCallCandidates(ULE, Args, *CandidateSet, S); // HLSL Change
 
   if (getLangOpts().MSVCCompat &&
       CurContext->isDependentContext() && !isSFINAEContext() &&
diff --git a/tools/clang/test/SemaHLSL/effects-syntax.hlsl b/tools/clang/test/SemaHLSL/effects-syntax.hlsl
index 5a7492a9da..e5468cbd41 100644
--- a/tools/clang/test/SemaHLSL/effects-syntax.hlsl
+++ b/tools/clang/test/SemaHLSL/effects-syntax.hlsl
@@ -108,12 +108,10 @@ static const PixelShader ps1 { state=foo; };                /* expected-warning
 /*verify-ast
   No matching AST found for line!
 */
-// expected-note@? {{'PixelShader' declared here}}
 PixelShadeR ps < int foo=1;>  = ps1;   // Case insensitive! /* expected-error {{unknown type name 'PixelShadeR'; did you mean 'PixelShader'?}} expected-warning {{effect object ignored - effect syntax is deprecated}} expected-warning {{possible effect annotation ignored - effect syntax is deprecated}} fxc-pass {{}} */
 /*verify-ast
   No matching AST found for line!
 */
-// expected-note@? {{'VertexShader' declared here}}
 VertexShadeR vs;        // Case insensitive!                /* expected-error {{unknown type name 'VertexShadeR'; did you mean 'VertexShader'?}} expected-warning {{effect object ignored - effect syntax is deprecated}} fxc-pass {{}} */
 
 // Case sensitive
diff --git a/tools/clang/test/SemaHLSL/raytracings.hlsl b/tools/clang/test/SemaHLSL/raytracings.hlsl
index d3bc01fcd6..429037f22b 100644
--- a/tools/clang/test/SemaHLSL/raytracings.hlsl
+++ b/tools/clang/test/SemaHLSL/raytracings.hlsl
@@ -12,14 +12,14 @@ void run() {
     RAY_FLAG_CULL_OPAQUE                     +
     RAY_FLAG_CULL_NON_OPAQUE;
 
-  rayFlags += RAY_FLAG_INVALID;                             /* expected-note@? {{'RAY_FLAG_NONE' declared here}} expected-error {{use of undeclared identifier 'RAY_FLAG_INVALID'; did you mean 'RAY_FLAG_NONE'?}} */
+  rayFlags += RAY_FLAG_INVALID;                             /* expected-error {{use of undeclared identifier 'RAY_FLAG_INVALID'; did you mean 'RAY_FLAG_NONE'?}} */
 
   int intFlag = RAY_FLAG_CULL_OPAQUE;
 
   int hitKindFlag =
     HIT_KIND_TRIANGLE_FRONT_FACE + HIT_KIND_TRIANGLE_BACK_FACE;
 
-  hitKindFlag += HIT_KIND_INVALID;                          /* expected-note@? {{'HIT_KIND_NONE' declared here}} expected-error {{use of undeclared identifier 'HIT_KIND_INVALID'; did you mean 'HIT_KIND_NONE'?}} */
+  hitKindFlag += HIT_KIND_INVALID;                          /* expected-error {{use of undeclared identifier 'HIT_KIND_INVALID'; did you mean 'HIT_KIND_NONE'?}} */
 
 
   BuiltInTriangleIntersectionAttributes attr;
diff --git a/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl b/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl
new file mode 100644
index 0000000000..233ce103ce
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl
@@ -0,0 +1,42 @@
+// RUN: %dxc -T lib_6_9 %s -verify
+
+RaytracingAccelerationStructure Scene : register(t0, space0);
+
+struct[raypayload] RayPayload {
+  float4 color : write(caller) : read(closesthit);
+};
+
+[shader("raygeneration")] void MyRaygenShader() {
+  // Set the ray's extents.
+  RayDesc ray;
+  ray.Origin = float3(0, 0, 1);
+  ray.Direction = float3(1, 0, 0);
+  ray.TMin = 0.001;
+  ray.TMax = 10000.0;
+
+  RayPayload payload = {float4(0, 0, 0, 0)};
+
+  {
+    using namespace dx;
+    HitObject hit =
+        HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0,
+                            ray, payload);
+
+    int sortKey = 1;
+    MaybeReorderThread(sortKey, 1);
+  }
+
+  {
+    int sortKey = 1;
+    MaybeReorderThread(sortKey, 1); // expected-error{{use of undeclared identifier 'MaybeReorderThread'; did you mean 'MaybeReorderThread'?}}
+  }
+
+  int sortKey = 1;
+  MaybeReorderThread(sortKey, 1); // expected-error{{use of undeclared identifier 'MaybeReorderThread'; did you mean 'MaybeReorderThread'?}}
+
+  HitObject hit = // expected-error{{unknown type name 'HitObject'}}
+        HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0,
+                            ray, payload);
+
+  HitObject::Invoke(hit, payload); // expected-error{{use of undeclared identifier 'HitObject'}}
+}
diff --git a/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl b/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl
new file mode 100644
index 0000000000..093e86b2fa
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl
@@ -0,0 +1,56 @@
+// RUN: %dxc -T lib_6_9 -ast-dump-implicit %s | FileCheck %s
+
+RaytracingAccelerationStructure Scene : register(t0, space0);
+
+struct[raypayload] RayPayload {
+  float4 color : write(caller) : read(closesthit);
+};
+
+namespace MyStuff {
+  using namespace dx;
+  void MaybeReorderThread(int2 V);
+}
+
+void MyStuff::MaybeReorderThread(int2 V) {
+  MaybeReorderThread(V.x, V.y);
+}
+
+[shader("raygeneration")] void MyRaygenShader() {
+  // Set the ray's extents.
+  RayDesc ray;
+  ray.Origin = float3(0, 0, 1);
+  ray.Direction = float3(1, 0, 0);
+  ray.TMin = 0.001;
+  ray.TMax = 10000.0;
+
+  RayPayload payload = {float4(0, 0, 0, 0)};
+  
+  using namespace dx;
+  HitObject hit =
+      HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0,
+                          ray, payload);
+
+  int sortKey = 1;
+  MaybeReorderThread(sortKey, 1);
+
+  HitObject::Invoke(hit, payload);
+
+  MyStuff::MaybeReorderThread(int2(sortKey, 1));
+}
+
+// Find the DeclRefExpr for the call to MaybeReorderThread:
+
+// CHECK: FunctionDecl [[MyDeclAddr:0x[0-9a-fA-F]+]] parent {{.*}} used MaybeReorderThread 'void (int2)'
+// CHECK: DeclRefExpr {{.*}} 'void (unsigned int, unsigned int)' lvalue Function [[DeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (unsigned int, unsigned int)'
+
+// CHECK: FunctionDecl [[DeclAddr]] <<invalid sloc>> <invalid sloc> implicit used MaybeReorderThread 'void (unsigned int, unsigned int)' extern
+// CHECK-NEXT: ParmVarDecl {{.*}} CoherenceHint 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} NumCoherenceHintBitsFromLSB 'unsigned int'
+// CHECK-NEXT: HLSLIntrinsicAttr {{.*}} Implicit "op" "" 359
+// CHECK-NEXT: AvailabilityAttr {{.*}} Implicit  6.9 0 0 ""
+
+// CHECK-LABEL: MyRaygenShader
+
+// CHECK: DeclRefExpr {{.*}} 'void (unsigned int, unsigned int)' lvalue Function [[DeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (unsigned int, unsigned int)'
+// CHECK: DeclRefExpr {{.*}} 'void (int2)' lvalue Function [[MyDeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (int2)'
+

From 020fbdf5056a798387d85ddb95fe630389125256 Mon Sep 17 00:00:00 2001
From: iOrange <sergey.kudlay@hotmail.com>
Date: Mon, 21 Jul 2025 13:26:35 -0400
Subject: [PATCH 90/93] [SPIR-V] Fixed a crash when encountering constant
 buffer fields with overlapping register assignments (#7636)

The issue:
a simple vertex shader like this one
```
uniform float4x4 gMVP : register(c0);
uniform float4   gFoo : register(c5);
uniform float4   gBar : register(c5);

float4 main(float4 pos : POSITION) : SV_Position {
    return mul(gMVP, pos * gFoo + gBar);
}
```
will result in an internal compiler error:
```
dxc.exe -spirv -T vs_6_2 -E main test.hlsl -Fo test.spirv
Internal compiler error: access violation. Attempted to read from address 0x0000000000000000
```

The crash is caused by `LowerTypeVisitor` trying to assign offsets to
fields without explicit locations.
It sorts the fields first, inserting every field that has a `register`
annotation into a map keyed by its register number.
Because that map is a `std::map`, when several fields share the same
`register` number only the first one inserted is kept and the others are
left out, which leaves nullptrs in the output vector.
The contents of that vector are read later on, which causes the crash.

My change fixes the crash and tries to report some useful information
about why the compilation failed.

I hope this helps you fix it properly, or you can take it as it is.
---
 tools/clang/lib/SPIRV/LowerTypeVisitor.cpp    | 66 +++++++++++--------
 tools/clang/lib/SPIRV/LowerTypeVisitor.h      |  6 ++
 .../test/CodeGenSPIRV/cbuffer.overlap.hlsl    | 11 ++++
 3 files changed, 55 insertions(+), 28 deletions(-)
 create mode 100644 tools/clang/test/CodeGenSPIRV/cbuffer.overlap.hlsl

diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 9d1f1fff60..45d04e8160 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -37,33 +37,6 @@ inline uint32_t roundToPow2(uint32_t val, uint32_t pow2) {
 
 } // end anonymous namespace
 
-// This method sorts a field list in the following order:
-//  - fields with register annotation first, sorted by register index.
-//  - then fields without annotation, in order of declaration.
-static std::vector<const HybridStructType::FieldInfo *>
-sortFields(llvm::ArrayRef<HybridStructType::FieldInfo> fields) {
-  std::vector<const HybridStructType::FieldInfo *> output;
-  output.resize(fields.size());
-
-  auto back_inserter = output.rbegin();
-  std::map<uint32_t, const HybridStructType::FieldInfo *> fixed_fields;
-  for (auto it = fields.rbegin(); it < fields.rend(); it++) {
-    if (it->registerC) {
-      fixed_fields.insert({it->registerC->RegisterNumber, &*it});
-    } else {
-      *back_inserter = &*it;
-      back_inserter++;
-    }
-  }
-
-  auto front_inserter = output.begin();
-  for (const auto &item : fixed_fields) {
-    *front_inserter = item.second;
-    front_inserter++;
-  }
-  return output;
-}
-
 static void setDefaultFieldSize(const AlignmentSizeCalculator &alignmentCalc,
                                 const SpirvLayoutRule rule,
                                 const HybridStructType::FieldInfo *currentField,
@@ -292,6 +265,37 @@ bool LowerTypeVisitor::visitInstruction(SpirvInstruction *instr) {
   return true;
 }
 
+std::vector<const HybridStructType::FieldInfo *> LowerTypeVisitor::sortFields(
+    llvm::ArrayRef<HybridStructType::FieldInfo> fields) {
+  std::vector<const HybridStructType::FieldInfo *> output;
+  output.resize(fields.size());
+
+  auto back_inserter = output.rbegin();
+  std::map<uint32_t, const HybridStructType::FieldInfo *> fixed_fields;
+  for (auto it = fields.rbegin(); it < fields.rend(); it++) {
+    if (it->registerC) {
+      auto insertionResult =
+          fixed_fields.insert({it->registerC->RegisterNumber, &*it});
+      if (!insertionResult.second) {
+        emitError(
+            "field \"%0\" at register(c%1) overlaps with previous members",
+            it->registerC->Loc)
+            << it->name << it->registerC->RegisterNumber;
+      }
+    } else {
+      *back_inserter = &*it;
+      back_inserter++;
+    }
+  }
+
+  auto front_inserter = output.begin();
+  for (const auto &item : fixed_fields) {
+    *front_inserter = item.second;
+    front_inserter++;
+  }
+  return output;
+}
+
 const SpirvType *LowerTypeVisitor::lowerType(const SpirvType *type,
                                              SpirvLayoutRule rule,
                                              SourceLocation loc) {
@@ -1378,12 +1382,19 @@ LowerTypeVisitor::populateLayoutInformation(
   llvm::SmallVector<StructType::FieldInfo, 4> loweredFields;
   llvm::DenseMap<const HybridStructType::FieldInfo *, uint32_t> fieldToIndexMap;
 
+  llvm::SmallVector<StructType::FieldInfo, 4> result;
+
   // This stores the index of the field in the actual SPIR-V construct.
   // When bitfields are merged, this index will be the same for merged fields.
   uint32_t fieldIndexInConstruct = 0;
   for (size_t i = 0, iPrevious = -1; i < sortedFields.size(); iPrevious = i++) {
     const size_t fieldIndexForMap = loweredFields.size();
 
+    // Can happen if sortFields runs over fields with the same register(c#)
+    if (!sortedFields[i]) {
+      return result;
+    }
+
     loweredFields.emplace_back(fieldVisitor(
         (iPrevious < loweredFields.size() ? &loweredFields[iPrevious]
                                           : nullptr),
@@ -1397,7 +1408,6 @@ LowerTypeVisitor::populateLayoutInformation(
   }
 
   // Re-order the sorted fields back to their original order.
-  llvm::SmallVector<StructType::FieldInfo, 4> result;
   for (const auto &field : fields)
     result.push_back(loweredFields[fieldToIndexMap[&field]]);
   return result;
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.h b/tools/clang/lib/SPIRV/LowerTypeVisitor.h
index 26b6e44f6d..276e6c9232 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.h
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.h
@@ -62,6 +62,12 @@ class LowerTypeVisitor : public Visitor {
     return astContext.getDiagnostics().Report(srcLoc, diagId);
   }
 
+  // This method sorts a field list in the following order:
+  //  - fields with register annotation first, sorted by register index.
+  //  - then fields without annotation, in order of declaration.
+  std::vector<const HybridStructType::FieldInfo *>
+  sortFields(llvm::ArrayRef<HybridStructType::FieldInfo> fields);
+
   /// Lowers the given Hybrid type into a SPIR-V type.
   ///
   /// Uses the above lowerType method to lower the QualType components of hybrid
diff --git a/tools/clang/test/CodeGenSPIRV/cbuffer.overlap.hlsl b/tools/clang/test/CodeGenSPIRV/cbuffer.overlap.hlsl
new file mode 100644
index 0000000000..4a2e72c7b5
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/cbuffer.overlap.hlsl
@@ -0,0 +1,11 @@
+// RUN: not %dxc -T vs_6_2 -E main -fcgl  %s -spirv  2>&1 | FileCheck %s
+
+// CHECK: error: field "gFoo" at register(c5) overlaps with previous members
+
+uniform float4x4 gMVP : register(c0);
+uniform float4   gFoo : register(c5);
+uniform float4   gBar : register(c5);
+
+float4 main(float4 pos : POSITION) : SV_Position {
+    return mul(gMVP, pos * gFoo + gBar);
+}

From 5ceaf84f8e033cce3e195237bbc772c80b9b2540 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alexsepkowski@gmail.com>
Date: Tue, 22 Jul 2025 10:38:54 -0700
Subject: [PATCH 91/93] [ExecutionTests] Fix shader source to prevent OOB
 access for ExecutionTests::AtomicsShared64Test (#7641)

Fixes #5198.

ExecutionTests::AtomicsShared64Test could sporadically fail.

g_sint64Share is indexed with ix%3 + 1, which evaluates to 1..3 and can
therefore access index 3, one past the end of the 3-element array (an
out-of-bounds access).
Increase the size of g_sint64Share to 4 and ensure all values are
initialized.
---
 tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
index a782bd97ae..dbea8e2aaf 100644
--- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
+++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml
@@ -1976,7 +1976,7 @@
         RWStructuredBuffer<uint64_t> g_shareXchg64Buf : register(u5);
 
         groupshared uint64_t g_uint64Share[6];
-        groupshared int64_t g_sint64Share[3];
+        groupshared int64_t g_sint64Share[4];
         groupshared uint64_t g_xchg64Share[64];
 
         #define VEC_CALL(op, uav, ix, val) op(uav[ix*stride], val);
@@ -2046,7 +2046,7 @@
           // Zero-init shared memory, with special cases
           if (ix < 6)
             g_uint64Share[ix] = ix == 1 ? 99999999ULL | (99999999ULL << 32) : ix == 3 ? ~0ULL : 0;
-          if (ix < 3)
+          if (ix < 4)
             g_sint64Share[ix] = ix == 1 ? 99999999ULL | (99999999ULL << 32) : 0;
           if (ix < 64)
             g_xchg64Share[ix] = 0;
@@ -2552,11 +2552,11 @@
 
         void InitSharedMem(uint ix) {
           // Zero-init shared memory, with special cases
-          if (ix < 6)
+          if (ix < 7)
             g_uintShare[ix] = ix == 1 ? 99999999 : ix == 3 ? -1 : 0;
-          if (ix < 3)
+          if (ix < 4)
             g_sintShare[ix] = ix == 1 ? 99999999 : 0;
-          if (ix < 64)
+          if (ix < 65)
             g_xchgShare[ix] = 0;
 
           GroupMemoryBarrierWithGroupSync();

From c78ed99263d4fba32764b54b49beba075973c851 Mon Sep 17 00:00:00 2001
From: Simon Moll <smoll@nvidia.com>
Date: Tue, 22 Jul 2025 19:58:51 +0200
Subject: [PATCH 92/93] [SER] GetAttributes(out udt) instead of templated
 return (#7606)

```
Old: T    HitObject::GetAttributes<T>()
New: void HitObject::GetAttributes(out udt)
```
- remove HitObject::GetAttributes<T> template code path from
DeduceTemplateArgumentsForHLSL
- clean up the intersection attribute diagnostic code path
- adjust GetAttributes calls and expected AST, HLOps in tests (DXIL
unaffected)

Closes #7534

This is a breaking change. Merge and release must be coordinated with:
- hlsl-spec change (https://github.com/microsoft/hlsl-specs/issues/495)
- HLK releases (SM6.9 preview tests use old signature)

---------

Co-authored-by: Tex Riddell <texr@microsoft.com>
---
 include/dxc/HLSL/HLOperations.h               |   3 +
 lib/HLSL/HLOperationLower.cpp                 |  17 +-
 .../Scalar/ScalarReplAggregatesHLSL.cpp       |   4 +
 tools/clang/include/clang/Sema/Sema.h         |   5 +-
 tools/clang/lib/Sema/SemaChecking.cpp         |   2 +-
 tools/clang/lib/Sema/SemaExpr.cpp             |   2 -
 tools/clang/lib/Sema/SemaHLSL.cpp             | 236 ++++++++----------
 .../HitObject/hitobject_attributes.hlsl       |   3 +-
 .../hitobject_attributes_builtin.hlsl         |   5 +-
 .../DxilGen/hitobject_attributes_dxilgen.ll   | 103 +++++---
 .../HitObject/hitobject_attributes.hlsl       |  14 +-
 .../hitobject_attributes_invalid_longvec.hlsl |   5 +-
 .../hitobject_attributes_invalid_udt.hlsl     |   6 +-
 .../types/invalid-hitobject-decls-struct.hlsl |   2 +-
 .../invalid-hitobject-decls-templated.hlsl    |   2 +-
 .../hlsl/types/invalid-longvec-decls.hlsl     |   2 +-
 utils/hct/gen_intrin_main.txt                 |   2 +-
 17 files changed, 209 insertions(+), 204 deletions(-)

diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h
index 0da9804ecb..79cbadc42c 100644
--- a/include/dxc/HLSL/HLOperations.h
+++ b/include/dxc/HLSL/HLOperations.h
@@ -462,6 +462,9 @@ const unsigned kHitObjectInvoke_PayloadOpIdx = 2;
 const unsigned kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx = 4;
 const unsigned kHitObjectFromRayQuery_WithAttrs_NumOp = 5;
 
+// HitObject::GetAttributes
+const unsigned kHitObjectGetAttributes_AttributeOpIdx = 2;
+
 // Linear Algebra Operations
 
 // MatVecMul
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 2033533327..3c062475af 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -6378,18 +6378,11 @@ Value *TranslateHitObjectGetAttributes(CallInst *CI, IntrinsicOp IOP,
 
   Value *HitObjectPtr = CI->getArgOperand(1);
   Value *HitObject = Builder.CreateLoad(HitObjectPtr);
-
-  Type *AttrTy = cast<PointerType>(CI->getType())->getPointerElementType();
-
-  IRBuilder<> EntryBuilder(
-      dxilutil::FindAllocaInsertionPt(CI->getParent()->getParent()));
-  unsigned AttrAlign = Helper.dataLayout.getABITypeAlignment(AttrTy);
-  AllocaInst *AttrMem = EntryBuilder.CreateAlloca(AttrTy);
-  AttrMem->setAlignment(AttrAlign);
-  Constant *opArg = OP->GetU32Const((unsigned)OpCode);
-  TrivialDxilOperation(OpCode, {opArg, HitObject, AttrMem}, CI->getType(),
-                       Helper.voidTy, OP, Builder);
-  return AttrMem;
+  Value *AttrOutPtr =
+      CI->getArgOperand(HLOperandIndex::kHitObjectGetAttributes_AttributeOpIdx);
+  TrivialDxilOperation(OpCode, {nullptr, HitObject, AttrOutPtr},
+                       AttrOutPtr->getType(), CI, OP);
+  return nullptr;
 }
 
 Value *TranslateHitObjectScalarGetter(CallInst *CI, IntrinsicOp IOP,
diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index d8746862bc..54250ad36d 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -1518,6 +1518,10 @@ static bool isUDTIntrinsicArg(CallInst *CI, unsigned OpIdx) {
     if (OpIdx == HLOperandIndex::kHitObjectInvoke_PayloadOpIdx)
       return true;
     break;
+  case IntrinsicOp::MOP_DxHitObject_GetAttributes:
+    if (OpIdx == HLOperandIndex::kHitObjectGetAttributes_AttributeOpIdx)
+      return true;
+    break;
   default:
     break;
   }
diff --git a/tools/clang/include/clang/Sema/Sema.h b/tools/clang/include/clang/Sema/Sema.h
index 5e20f6f0f8..6eb0aba801 100644
--- a/tools/clang/include/clang/Sema/Sema.h
+++ b/tools/clang/include/clang/Sema/Sema.h
@@ -3811,8 +3811,7 @@ class Sema {
   void DiagnoseHLSLDeclAttr(const Decl *D, const Attr *A);
   void DiagnoseCoherenceMismatch(const Expr *SrcExpr, QualType TargetType,
                                  SourceLocation Loc);
-  void CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
-                             const FunctionProtoType *Proto);
+  void CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall);
   void DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM,
                                  hlsl::DXIL::ShaderKind EntrySK,
                                  hlsl::DXIL::NodeLaunchType NodeLaunchTy,
@@ -8831,8 +8830,6 @@ class Sema {
                         bool AllowOnePastEnd=true, bool IndexNegated=false);
   // HLSL Change Starts - checking array subscript access to vector or matrix member
   void CheckHLSLArrayAccess(const Expr *expr);
-  bool CheckHLSLIntrinsicCall(FunctionDecl *FDecl, CallExpr *TheCall);
-  bool CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall);
   // HLSL Change ends
   void CheckArrayAccess(const Expr *E);
   // Used to grab the relevant information from a FormatAttr and a
diff --git a/tools/clang/lib/Sema/SemaChecking.cpp b/tools/clang/lib/Sema/SemaChecking.cpp
index 9e64732336..e3932220f9 100644
--- a/tools/clang/lib/Sema/SemaChecking.cpp
+++ b/tools/clang/lib/Sema/SemaChecking.cpp
@@ -1426,7 +1426,7 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
     CheckMemaccessArguments(TheCall, CMId, FnInfo);
 #endif // HLSL Change Ends
 
-  CheckHLSLFunctionCall(FDecl, TheCall, Proto); // HLSL Change
+  CheckHLSLFunctionCall(FDecl, TheCall); // HLSL Change
 
   return false;
 }
diff --git a/tools/clang/lib/Sema/SemaExpr.cpp b/tools/clang/lib/Sema/SemaExpr.cpp
index cbc4ac37ab..cccf711126 100644
--- a/tools/clang/lib/Sema/SemaExpr.cpp
+++ b/tools/clang/lib/Sema/SemaExpr.cpp
@@ -5349,8 +5349,6 @@ Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
   if (FDecl) {
     if (CheckFunctionCall(FDecl, TheCall, Proto))
       return ExprError();
-    if (CheckHLSLFunctionCall(FDecl, TheCall))
-      return ExprError();
     if (BuiltinID)
       return CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall);
   } else if (NDecl) {
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index dcb6142858..656dfb401f 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -10829,18 +10829,24 @@ HLSLExternalSource::ApplyTypeSpecSignToParsedType(clang::QualType &type,
   }
 }
 
-bool DiagnoseIntersectionAttributes(Sema &S, SourceLocation Loc, QualType Ty) {
-  // Must be a UDT
+bool CheckIntersectionAttributeArg(Sema &S, Expr *E) {
+  SourceLocation Loc = E->getExprLoc();
+  QualType Ty = E->getType();
+
+  // Identify problematic fields first (high diagnostic accuracy, may miss some
+  // invalid cases)
+  const TypeDiagContext DiagContext = TypeDiagContext::Attributes;
+  if (DiagnoseTypeElements(S, Loc, Ty, DiagContext, DiagContext))
+    return true;
+
+  // Must be a UDT (low diagnostic accuracy, catches remaining invalid cases)
   if (Ty.isNull() || !hlsl::IsHLSLCopyableAnnotatableRecord(Ty)) {
     S.Diag(Loc, diag::err_payload_attrs_must_be_udt)
         << /*payload|attributes|callable*/ 1 << /*parameter %2|type*/ 1;
-    return false;
+    return true;
   }
 
-  const TypeDiagContext DiagContext = TypeDiagContext::Attributes;
-  if (DiagnoseTypeElements(S, Loc, Ty, DiagContext, DiagContext))
-    return false;
-  return true;
+  return false;
 }
 
 Sema::TemplateDeductionResult
@@ -10951,7 +10957,6 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
     LPCSTR tableName = cursor.GetTableName();
     // Currently only intrinsic we allow for explicit template arguments are
     // for Load/Store for ByteAddressBuffer/RWByteAddressBuffer
-    // and HitObject::GetAttributes with user-defined intersection attributes.
 
     // Check Explicit template arguments
     UINT intrinsicOp = (*cursor)->Op;
@@ -10966,11 +10971,9 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
       IsBABLoad = intrinsicOp == (UINT)IntrinsicOp::MOP_Load;
       IsBABStore = intrinsicOp == (UINT)IntrinsicOp::MOP_Store;
     }
-    bool IsHitObjectGetAttributes =
-        intrinsicOp == (UINT)IntrinsicOp::MOP_DxHitObject_GetAttributes;
     if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() >= 1) {
       SourceLocation Loc = ExplicitTemplateArgs->getLAngleLoc();
-      if (!IsBABLoad && !IsBABStore && !IsHitObjectGetAttributes) {
+      if (!IsBABLoad && !IsBABStore) {
         getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_unsupported)
             << intrinsicName;
         return Sema::TemplateDeductionResult::TDK_Invalid;
@@ -11000,10 +11003,6 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
           return Sema::TemplateDeductionResult::TDK_Invalid;
         }
       }
-      if (IsHitObjectGetAttributes &&
-          !DiagnoseIntersectionAttributes(*getSema(), Loc,
-                                          functionTemplateTypeArg))
-        return Sema::TemplateDeductionResult::TDK_Invalid;
     } else if (IsBABStore) {
       // Prior to HLSL 2018, Store operation only stored scalar uint.
       if (!Is2018) {
@@ -12277,9 +12276,78 @@ static bool CheckVKBufferPointerCast(Sema &S, FunctionDecl *FD, CallExpr *CE,
 }
 #endif
 
+static bool isRelatedDeclMarkedNointerpolation(Expr *E) {
+  if (!E)
+    return false;
+  E = E->IgnoreCasts();
+  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+    return DRE->getDecl()->hasAttr<HLSLNoInterpolationAttr>();
+
+  if (auto *ME = dyn_cast<MemberExpr>(E))
+    return ME->getMemberDecl()->hasAttr<HLSLNoInterpolationAttr>() ||
+           isRelatedDeclMarkedNointerpolation(ME->getBase());
+
+  if (auto *HVE = dyn_cast<HLSLVectorElementExpr>(E))
+    return isRelatedDeclMarkedNointerpolation(HVE->getBase());
+
+  if (auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
+    return isRelatedDeclMarkedNointerpolation(ASE->getBase());
+
+  return false;
+}
+
+static bool CheckIntrinsicGetAttributeAtVertex(Sema &S, FunctionDecl *FDecl,
+                                               CallExpr *TheCall) {
+  assert(TheCall->getNumArgs() > 0);
+  auto argument = TheCall->getArg(0)->IgnoreCasts();
+
+  if (!isRelatedDeclMarkedNointerpolation(argument)) {
+    S.Diag(argument->getExprLoc(), diag::err_hlsl_parameter_requires_attribute)
+        << 0 << FDecl->getName() << "nointerpolation";
+    return true;
+  }
+
+  return false;
+}
+
+static bool CheckNoInterpolationParams(Sema &S, FunctionDecl *FDecl,
+                                       CallExpr *TheCall) {
+  // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want
+  // to limit the scope, and fail gracefully in some cases.
+  if (!S.getLangOpts().SPIRV)
+    return false;
+
+  bool error = false;
+  for (unsigned i = 0; i < FDecl->getNumParams(); i++) {
+    assert(i < TheCall->getNumArgs());
+
+    if (!FDecl->getParamDecl(i)->hasAttr<HLSLNoInterpolationAttr>())
+      continue;
+
+    if (!isRelatedDeclMarkedNointerpolation(TheCall->getArg(i))) {
+      S.Diag(TheCall->getArg(i)->getExprLoc(),
+             diag::err_hlsl_parameter_requires_attribute)
+          << i << FDecl->getName() << "nointerpolation";
+      error = true;
+    }
+  }
+
+  return error;
+}
+
+// Verify that user-defined intrinsic struct args contain no long vectors
+static bool CheckUDTIntrinsicArg(Sema &S, Expr *Arg) {
+  const TypeDiagContext DiagContext =
+      TypeDiagContext::UserDefinedStructParameter;
+  return DiagnoseTypeElements(S, Arg->getExprLoc(), Arg->getType(), DiagContext,
+                              DiagContext);
+}
+
 // Check HLSL call constraints, not fatal to creating the AST.
-void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
-                                 const FunctionProtoType *Proto) {
+void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall) {
+  if (CheckNoInterpolationParams(*this, FDecl, TheCall))
+    return;
+
   HLSLIntrinsicAttr *IntrinsicAttr = FDecl->getAttr<HLSLIntrinsicAttr>();
   if (!IntrinsicAttr)
     return;
@@ -12307,6 +12375,28 @@ void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
   case hlsl::IntrinsicOp::IOP___builtin_OuterProductAccumulate:
     CheckOuterProductAccumulateCall(*this, FDecl, TheCall);
     break;
+  case hlsl::IntrinsicOp::IOP_GetAttributeAtVertex:
+    // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want
+    // to limit the scope, and fail gracefully in some cases.
+    if (!getLangOpts().SPIRV)
+      return;
+    CheckIntrinsicGetAttributeAtVertex(*this, FDecl, TheCall);
+    break;
+  case hlsl::IntrinsicOp::IOP_DispatchMesh:
+    CheckUDTIntrinsicArg(*this, TheCall->getArg(3)->IgnoreCasts());
+    break;
+  case hlsl::IntrinsicOp::IOP_CallShader:
+    CheckUDTIntrinsicArg(*this, TheCall->getArg(1)->IgnoreCasts());
+    break;
+  case hlsl::IntrinsicOp::IOP_TraceRay:
+    CheckUDTIntrinsicArg(*this, TheCall->getArg(7)->IgnoreCasts());
+    break;
+  case hlsl::IntrinsicOp::IOP_ReportHit:
+    CheckIntersectionAttributeArg(*this, TheCall->getArg(2)->IgnoreCasts());
+    break;
+  case hlsl::IntrinsicOp::MOP_DxHitObject_GetAttributes:
+    CheckIntersectionAttributeArg(*this, TheCall->getArg(0)->IgnoreCasts());
+    break;
 #ifdef ENABLE_SPIRV_CODEGEN
   case hlsl::IntrinsicOp::IOP_Vkreinterpret_pointer_cast:
     CheckVKBufferPointerCast(*this, FDecl, TheCall, false);
@@ -16841,118 +16931,6 @@ QualType Sema::getHLSLDefaultSpecialization(TemplateDecl *Decl) {
   return QualType();
 }
 
-static bool isRelatedDeclMarkedNointerpolation(Expr *E) {
-  if (!E)
-    return false;
-  E = E->IgnoreCasts();
-  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
-    return DRE->getDecl()->hasAttr<HLSLNoInterpolationAttr>();
-
-  if (auto *ME = dyn_cast<MemberExpr>(E))
-    return ME->getMemberDecl()->hasAttr<HLSLNoInterpolationAttr>() ||
-           isRelatedDeclMarkedNointerpolation(ME->getBase());
-
-  if (auto *HVE = dyn_cast<HLSLVectorElementExpr>(E))
-    return isRelatedDeclMarkedNointerpolation(HVE->getBase());
-
-  if (auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
-    return isRelatedDeclMarkedNointerpolation(ASE->getBase());
-
-  return false;
-}
-
-// Verify that user-defined intrinsic struct args contain no long vectors
-static bool CheckUDTIntrinsicArg(Sema *S, Expr *Arg) {
-  const TypeDiagContext DiagContext =
-      TypeDiagContext::UserDefinedStructParameter;
-  return DiagnoseTypeElements(*S, Arg->getExprLoc(), Arg->getType(),
-                              DiagContext, DiagContext);
-}
-
-static bool CheckIntrinsicGetAttributeAtVertex(Sema *S, FunctionDecl *FDecl,
-                                               CallExpr *TheCall) {
-  assert(TheCall->getNumArgs() > 0);
-  auto argument = TheCall->getArg(0)->IgnoreCasts();
-
-  if (!isRelatedDeclMarkedNointerpolation(argument)) {
-    S->Diag(argument->getExprLoc(), diag::err_hlsl_parameter_requires_attribute)
-        << 0 << FDecl->getName() << "nointerpolation";
-    return true;
-  }
-
-  return false;
-}
-
-bool Sema::CheckHLSLIntrinsicCall(FunctionDecl *FDecl, CallExpr *TheCall) {
-  auto attr = FDecl->getAttr<HLSLIntrinsicAttr>();
-
-  if (!attr)
-    return false;
-
-  if (!IsBuiltinTable(attr->getGroup()))
-    return false;
-
-  switch (hlsl::IntrinsicOp(attr->getOpcode())) {
-  case hlsl::IntrinsicOp::IOP_GetAttributeAtVertex:
-    // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want
-    // to limit the scope, and fail gracefully in some cases.
-    if (!getLangOpts().SPIRV)
-      return false;
-    // This should never happen for SPIR-V. But on the DXIL side, extension can
-    // be added by inserting new intrinsics, meaning opcodes can collide with
-    // existing ones. See the ExtensionTest.EvalAttributeCollision test.
-    assert(FDecl->getName() == "GetAttributeAtVertex");
-    return CheckIntrinsicGetAttributeAtVertex(this, FDecl, TheCall);
-  case hlsl::IntrinsicOp::IOP_DispatchMesh:
-    assert(TheCall->getNumArgs() > 3);
-    assert(FDecl->getName() == "DispatchMesh");
-    return CheckUDTIntrinsicArg(this, TheCall->getArg(3)->IgnoreCasts());
-  case hlsl::IntrinsicOp::IOP_CallShader:
-    assert(TheCall->getNumArgs() > 1);
-    assert(FDecl->getName() == "CallShader");
-    return CheckUDTIntrinsicArg(this, TheCall->getArg(1)->IgnoreCasts());
-  case hlsl::IntrinsicOp::IOP_TraceRay:
-    assert(TheCall->getNumArgs() > 7);
-    assert(FDecl->getName() == "TraceRay");
-    return CheckUDTIntrinsicArg(this, TheCall->getArg(7)->IgnoreCasts());
-  case hlsl::IntrinsicOp::IOP_ReportHit:
-    assert(TheCall->getNumArgs() > 2);
-    assert(FDecl->getName() == "ReportHit");
-    return CheckUDTIntrinsicArg(this, TheCall->getArg(2)->IgnoreCasts());
-  default:
-    break;
-  }
-
-  return false;
-}
-
-bool Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall) {
-  if (hlsl::IsIntrinsicOp(FDecl) && CheckHLSLIntrinsicCall(FDecl, TheCall))
-    return true;
-
-  // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want
-  // to limit the scope, and fail gracefully in some cases.
-  if (!getLangOpts().SPIRV)
-    return false;
-
-  bool error = false;
-  for (unsigned i = 0; i < FDecl->getNumParams(); i++) {
-    assert(i < TheCall->getNumArgs());
-
-    if (!FDecl->getParamDecl(i)->hasAttr<HLSLNoInterpolationAttr>())
-      continue;
-
-    if (!isRelatedDeclMarkedNointerpolation(TheCall->getArg(i))) {
-      Diag(TheCall->getArg(i)->getExprLoc(),
-           diag::err_hlsl_parameter_requires_attribute)
-          << i << FDecl->getName() << "nointerpolation";
-      error = true;
-    }
-  }
-
-  return error;
-}
-
 namespace hlsl {
 
 static bool nodeInputIsCompatible(DXIL::NodeIOKind IOType,
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
index 03cefe8e48..55ef023a2f 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl
@@ -20,7 +20,8 @@ CustomAttrs {
 [shader("raygeneration")]
 void main() {
   dx::HitObject hit;
-  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>();
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
   float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y;
   outbuf.Store(0, sum);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl
index a096bb6f11..59140ab37e 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl
@@ -5,7 +5,7 @@
 // as a template argument to GetAttributes.
 
 // For -fcgl, just check the form of the HL call.
-// FCGL: %{{[^ ]+}} = call %struct.BuiltInTriangleIntersectionAttributes* @"dx.hl.op..%struct.BuiltInTriangleIntersectionAttributes* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}})
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.BuiltInTriangleIntersectionAttributes*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}}, %struct.BuiltInTriangleIntersectionAttributes* %{{[^ ]+}})
 
 // CHECK: %[[ATTR:[^ ]+]] = alloca %struct.BuiltInTriangleIntersectionAttributes
 // CHECK: call void @dx.op.hitObject_Attributes.struct.BuiltInTriangleIntersectionAttributes(i32 289, %dx.types.HitObject %{{[^ ]+}}, %struct.BuiltInTriangleIntersectionAttributes* nonnull %[[ATTR]])
@@ -34,7 +34,8 @@ void MyRaygenShader()
 
     dx::HitObject hit = dx::HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, ray, payload);
 
-    MyAttribs attr = hit.GetAttributes<MyAttribs>();
+    MyAttribs attr;
+    hit.GetAttributes(attr);
     payload.color += float4(attr,0,1);
 
     // Write the raytraced color to the output texture.
diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
index 4887be4d58..3488a3df03 100644
--- a/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
+++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll
@@ -30,30 +30,41 @@ target triple = "dxil-ms-dx"
 define void @"\01?main@@YAXXZ"() #0 {
 entry:
   %hit = alloca %dx.types.HitObject, align 4
-  %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !21 ; line:22 col:3
-  call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !21 ; line:22 col:3
-  %1 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !25 ; line:22 col:17
-  %2 = call %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %hit), !dbg !26 ; line:23 col:23
-  %3 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %2, i32 0, i32 0, !dbg !26 ; line:23 col:23
-  %4 = load <4 x float>, <4 x float>* %3, !dbg !26 ; line:23 col:23
-  %5 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %2, i32 0, i32 1, !dbg !26 ; line:23 col:23
-  %6 = load i32, i32* %5, !dbg !26 ; line:23 col:23
-  %7 = extractelement <4 x float> %4, i32 0, !dbg !27 ; line:24 col:15
-  %8 = extractelement <4 x float> %4, i32 1, !dbg !28 ; line:24 col:27
-  %add = fadd float %7, %8, !dbg !29 ; line:24 col:25
-  %9 = extractelement <4 x float> %4, i32 2, !dbg !30 ; line:24 col:39
-  %add4 = fadd float %add, %9, !dbg !31 ; line:24 col:37
-  %10 = extractelement <4 x float> %4, i32 3, !dbg !32 ; line:24 col:51
-  %add6 = fadd float %add4, %10, !dbg !33 ; line:24 col:49
-  %conv = sitofp i32 %6 to float, !dbg !34 ; line:24 col:63
-  %add7 = fadd float %add6, %conv, !dbg !35 ; line:24 col:61
-  %11 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !36 ; line:25 col:3
-  %12 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %11), !dbg !36 ; line:25 col:3
-  %13 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %12, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !36 ; line:25 col:3
-  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %13, i32 0, float %add7), !dbg !36 ; line:25 col:3
-  %14 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !37 ; line:26 col:1
-  call void @llvm.lifetime.end(i64 4, i8* %14) #0, !dbg !37 ; line:26 col:1
-  ret void, !dbg !37 ; line:26 col:1
+  %attrs = alloca %struct.CustomAttrs, align 4
+  %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !21 ; line:29 col:3
+  call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !21 ; line:29 col:3
+  %1 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !25 ; line:29 col:17
+  %2 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !26 ; line:30 col:3
+  call void @llvm.lifetime.start(i64 20, i8* %2) #0, !dbg !26 ; line:30 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32 364, %dx.types.HitObject* %hit, %struct.CustomAttrs* %attrs), !dbg !27 ; line:31 col:3
+  %v = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !28 ; line:32 col:21
+  %3 = load <4 x float>, <4 x float>* %v, align 4, !dbg !29 ; line:32 col:15
+  %4 = extractelement <4 x float> %3, i32 0, !dbg !29 ; line:32 col:15
+  %v1 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !30 ; line:32 col:33
+  %5 = load <4 x float>, <4 x float>* %v1, align 4, !dbg !31 ; line:32 col:27
+  %6 = extractelement <4 x float> %5, i32 1, !dbg !31 ; line:32 col:27
+  %add = fadd float %4, %6, !dbg !32 ; line:32 col:25
+  %v2 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !33 ; line:32 col:45
+  %7 = load <4 x float>, <4 x float>* %v2, align 4, !dbg !34 ; line:32 col:39
+  %8 = extractelement <4 x float> %7, i32 2, !dbg !34 ; line:32 col:39
+  %add3 = fadd float %add, %8, !dbg !35 ; line:32 col:37
+  %v4 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !36 ; line:32 col:57
+  %9 = load <4 x float>, <4 x float>* %v4, align 4, !dbg !37 ; line:32 col:51
+  %10 = extractelement <4 x float> %9, i32 3, !dbg !37 ; line:32 col:51
+  %add5 = fadd float %add3, %10, !dbg !38 ; line:32 col:49
+  %y = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !39 ; line:32 col:69
+  %11 = load i32, i32* %y, align 4, !dbg !39, !tbaa !40 ; line:32 col:69
+  %conv = sitofp i32 %11 to float, !dbg !44 ; line:32 col:63
+  %add6 = fadd float %add5, %conv, !dbg !45 ; line:32 col:61
+  %12 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !46 ; line:33 col:3
+  %13 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %12), !dbg !46 ; line:33 col:3
+  %14 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %13, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !46 ; line:33 col:3
+  call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %14, i32 0, float %add6), !dbg !46 ; line:33 col:3
+  %15 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !47 ; line:34 col:1
+  call void @llvm.lifetime.end(i64 20, i8* %15) #0, !dbg !47 ; line:34 col:1
+  %16 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !47 ; line:34 col:1
+  call void @llvm.lifetime.end(i64 4, i8* %16) #0, !dbg !47 ; line:34 col:1
+  ret void, !dbg !47 ; line:34 col:1
 }
 
 ; Function Attrs: nounwind
@@ -66,7 +77,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #0
 declare %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
 
 ; Function Attrs: nounwind
-declare %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0
+declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, %struct.CustomAttrs*) #0
 
 ; Function Attrs: nounwind
 declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0
@@ -111,20 +122,30 @@ attributes #1 = { nounwind readnone }
 !18 = !{void ()* @"\01?main@@YAXXZ", i32 7}
 !19 = !{i32 -2147483584}
 !20 = !{i32 -1}
-!21 = !DILocation(line: 22, column: 3, scope: !22)
-!22 = !DISubprogram(name: "main", scope: !23, file: !23, line: 21, type: !24, isLocal: false, isDefinition: true, scopeLine: 21, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
-!23 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl", directory: "")
+!21 = !DILocation(line: 29, column: 3, scope: !22)
+!22 = !DISubprogram(name: "main", scope: !23, file: !23, line: 28, type: !24, isLocal: false, isDefinition: true, scopeLine: 28, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ")
+!23 = !DIFile(filename: "tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl", directory: "")
 !24 = !DISubroutineType(types: !13)
-!25 = !DILocation(line: 22, column: 17, scope: !22)
-!26 = !DILocation(line: 23, column: 23, scope: !22)
-!27 = !DILocation(line: 24, column: 15, scope: !22)
-!28 = !DILocation(line: 24, column: 27, scope: !22)
-!29 = !DILocation(line: 24, column: 25, scope: !22)
-!30 = !DILocation(line: 24, column: 39, scope: !22)
-!31 = !DILocation(line: 24, column: 37, scope: !22)
-!32 = !DILocation(line: 24, column: 51, scope: !22)
-!33 = !DILocation(line: 24, column: 49, scope: !22)
-!34 = !DILocation(line: 24, column: 63, scope: !22)
-!35 = !DILocation(line: 24, column: 61, scope: !22)
-!36 = !DILocation(line: 25, column: 3, scope: !22)
-!37 = !DILocation(line: 26, column: 1, scope: !22)
+!25 = !DILocation(line: 29, column: 17, scope: !22)
+!26 = !DILocation(line: 30, column: 3, scope: !22)
+!27 = !DILocation(line: 31, column: 3, scope: !22)
+!28 = !DILocation(line: 32, column: 21, scope: !22)
+!29 = !DILocation(line: 32, column: 15, scope: !22)
+!30 = !DILocation(line: 32, column: 33, scope: !22)
+!31 = !DILocation(line: 32, column: 27, scope: !22)
+!32 = !DILocation(line: 32, column: 25, scope: !22)
+!33 = !DILocation(line: 32, column: 45, scope: !22)
+!34 = !DILocation(line: 32, column: 39, scope: !22)
+!35 = !DILocation(line: 32, column: 37, scope: !22)
+!36 = !DILocation(line: 32, column: 57, scope: !22)
+!37 = !DILocation(line: 32, column: 51, scope: !22)
+!38 = !DILocation(line: 32, column: 49, scope: !22)
+!39 = !DILocation(line: 32, column: 69, scope: !22)
+!40 = !{!41, !41, i64 0}
+!41 = !{!"int", !42, i64 0}
+!42 = !{!"omnipotent char", !43, i64 0}
+!43 = !{!"Simple C/C++ TBAA"}
+!44 = !DILocation(line: 32, column: 63, scope: !22)
+!45 = !DILocation(line: 32, column: 61, scope: !22)
+!46 = !DILocation(line: 33, column: 3, scope: !22)
+!47 = !DILocation(line: 34, column: 1, scope: !22)
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
index 79db78cdaf..609d94f291 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
@@ -1,15 +1,20 @@
 // RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
 // RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
 
+
 // AST: | | |-FunctionTemplateDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetAttributes
 // AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TResult
-// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetAttributes 'TResult () const'
-// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetAttributes 'CustomAttrs &()' extern
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> class TAttributes
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> implicit GetAttributes 'TResult (TAttributes &) const'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> Attributes 'TAttributes &'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> used GetAttributes 'void (CustomAttrs &)' extern
+// AST-NEXT: | | |   |-TemplateArgument type 'void'
 // AST-NEXT: | | |   |-TemplateArgument type 'CustomAttrs'
+// AST-NEXT: | | |   |-ParmVarDecl {{[^ ]+}} <<invalid sloc>> <invalid sloc> GetAttributes 'CustomAttrs &&__restrict'
 // AST-NEXT: | | |   |-HLSLIntrinsicAttr {{[^ ]+}} <<invalid sloc>> Implicit "op" "" 364
 // AST-NEXT: | | |   `-AvailabilityAttr {{[^ ]+}} <<invalid sloc>> Implicit  6.9 0 0 ""
 
-// FCGL: %{{[^ ]+}} = call %struct.CustomAttrs* @"dx.hl.op..%struct.CustomAttrs* (i32, %dx.types.HitObject*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}})
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}}, %struct.CustomAttrs* %{{[^ ]+}})
 
 RWByteAddressBuffer outbuf;
 
@@ -22,7 +27,8 @@ CustomAttrs {
 [shader("raygeneration")]
 void main() {
   dx::HitObject hit;
-  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>();
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
   float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y;
   outbuf.Store(0, sum);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
index 240ccfb9d4..97bb81a7cb 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
@@ -9,6 +9,7 @@ CustomAttrs {
 [shader("raygeneration")]
 void main() {
   dx::HitObject hit;
-  // expected-error@+1{{vectors of over 4 elements in attributes are not supported}}
-  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>();
+  // expected-error@+2{{vectors of over 4 elements in attributes are not supported}}
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
index 0f27f089e4..f8935676c5 100644
--- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
@@ -9,6 +9,8 @@ CustomAttrs {
 [shader("raygeneration")]
 void main() {
   dx::HitObject hit;
-  // expected-error@+1{{attributes type must be a user-defined type composed of only numeric types}}
-  CustomAttrs attrs = hit.GetAttributes<CustomAttrs>();
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
+  // expected-error@-1{{vectors of over 4 elements in attributes are not supported}}
+  // expected-error@-2{{attributes type must be a user-defined type composed of only numeric types}}
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
index b6b28700a9..c852d17a1a 100644
--- a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
@@ -264,7 +264,7 @@ void Intersection() {
   float hitT = RayTCurrent();
   RTTYPE attr = (RTTYPE)0;
   bool bReported = ReportHit(hitT, 0, attr);
-  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in attributes}}
   // expected-note@16{{'dx::HitObject' field declared here}}
 }
 
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl
index 4ffd53878d..c2303a8608 100644
--- a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl
@@ -275,7 +275,7 @@ void Intersection() {
   float hitT = RayTCurrent();
   RTTYPE attr = (RTTYPE)0;
   bool bReported = ReportHit(hitT, 0, attr);
-  // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}}
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in attributes}}
   // expected-note@40{{'dx::HitObject' field declared here}}
 }
 
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl
index 0604feeaec..96c5d4b5f4 100644
--- a/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl
@@ -146,7 +146,7 @@ void Miss(inout RTTYPE payload){ // expected-error{{vectors of over 4 elements i
 void Intersection() {
   float hitT = RayTCurrent();
   RTTYPE attr = (RTTYPE)0;
-  bool bReported = ReportHit(hitT, 0, attr); // expected-error{{vectors of over 4 elements in user-defined struct parameter are not supported}}
+  bool bReported = ReportHit(hitT, 0, attr); // expected-error{{vectors of over 4 elements in attributes are not supported}}
 }
 
 [shader("callable")]
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index ae8df55a0c..f2c0cc5e2e 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -1131,7 +1131,7 @@ namespace DxHitObjectMethods {
     uint [[rn,class_prefix,min_sm=6.9]] GetPrimitiveIndex();
     uint [[rn,class_prefix,min_sm=6.9]] GetHitKind();
     uint [[rn,class_prefix,min_sm=6.9]] GetShaderTableIndex();
-    $funcT [[class_prefix,min_sm=6.9]] GetAttributes();
+    void [[class_prefix,min_sm=6.9]] GetAttributes(out udt Attributes);
     void [[class_prefix,min_sm=6.9]] SetShaderTableIndex(in uint RecordIndex);
     uint [[ro,class_prefix,min_sm=6.9]] LoadLocalRootTableConstant(in uint RootConstantOffsetInBytes);
 } namespace

From d64d34ced89e2ac407d151b8f04516684fc41ed0 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alexsepkowski@gmail.com>
Date: Wed, 23 Jul 2025 17:33:18 -0700
Subject: [PATCH 93/93] Merge Long Vector Trigonometric Op Exec Tests (#7665)

Resolves #7629

Merge the long vector trig op exec tests from staging-sm6.9.
Verified locally against WARP:
`F:\hlsl.bin\TAEF\x64\te.exe "F:\hlsl.bin\Debug\bin\ExecHLSLTests.dll"
/name:LongVector::OpTest::trig* /p:D3D12SDKVersion=1
/p:"ExperimentalShaders=*"`
---
 .../unittests/HLSLExec/LongVectorOpTable.xml  |  94 ++++++++++++++
 .../clang/unittests/HLSLExec/LongVectors.cpp  |  21 +++
 tools/clang/unittests/HLSLExec/LongVectors.h  |  49 +++++++
 .../clang/unittests/HLSLExec/LongVectors.tpp  | 121 ++++++++++++++++++
 4 files changed, 285 insertions(+)

diff --git a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
index df8fe250c9..f3b2e62dbc 100644
--- a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
+++ b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml
@@ -596,4 +596,98 @@
         <Parameter Name="DataType">float64</Parameter>
       </Row>
     </Table>
+    <Table Id="TrigonometricOpTable">
+      <ParameterTypes>
+        <!-- InputValueSetName1 is optional. If no value is provided use the
+        default value set for the data type. This string is meant to be a key
+        value for the array of std::pairs defined in LongVectorTestData.h
+        for the applicable DataType-->
+        <ParameterType Name="InputValueSetName1">String</ParameterType>
+        <!-- InputArgsName is optional and is also a key to the array of
+        std::pairs defined in LongVectorTestData.h for the applicable DataType.
+        Used for args like min and max in clamp-->
+        <ParameterType Name="DataType">String</ParameterType>
+        <ParameterType Name="OpTypeEnum">String</ParameterType>
+      </ParameterTypes>
+      <!-- TrigonometricOpTable DataType: float16 -->
+      <Row Name="Acos_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Acos</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeOne</Parameter>
+      </Row>
+      <Row Name="Asin_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Asin</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeHalfPi</Parameter>
+      </Row>
+      <Row Name="Atan_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Atan</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeHalfPi</Parameter>
+      </Row>
+      <Row Name="Cos_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Cos</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <Row Name="Cosh_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Cosh</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <Row Name="Sin_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Sin</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <Row Name="Sinh_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Sinh</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <Row Name="Tan_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Tan</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <Row Name="Tanh_float16">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Tanh</Parameter>
+        <Parameter Name="DataType">float16</Parameter>
+      </Row>
+      <!-- TrigonometricOpTable DataType: float32 -->
+      <Row Name="Acos_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Acos</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeOne</Parameter>
+      </Row>
+      <Row Name="Asin_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Asin</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeHalfPi</Parameter>
+      </Row>
+      <Row Name="Atan_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Atan</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+        <Parameter Name="InputValueSetName1">TrigonometricInputValueSet_RangeHalfPi</Parameter>
+      </Row>
+      <Row Name="Cos_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Cos</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <Row Name="Cosh_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Cosh</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <Row Name="Sin_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Sin</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <Row Name="Sinh_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Sinh</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <Row Name="Tan_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Tan</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+      <Row Name="Tanh_float32">
+        <Parameter Name="OpTypeEnum">TrigonometricOpType_Tanh</Parameter>
+        <Parameter Name="DataType">float32</Parameter>
+      </Row>
+    </Table>
 </Data>
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp
index 9c2d3d229c..b9e79cfc5e 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.cpp
+++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp
@@ -16,6 +16,13 @@ LongVector::getUnaryOpType(const std::wstring &OpTypeString) {
       std::size(unaryOpTypeStringToEnumMap));
 }
 
+LongVector::TrigonometricOpType
+LongVector::getTrigonometricOpType(const std::wstring &OpTypeString) {
+  return getLongVectorOpType<LongVector::TrigonometricOpType>(
+      trigonometricOpTypeStringToEnumMap, OpTypeString,
+      std::size(trigonometricOpTypeStringToEnumMap));
+}
+
 // These are helper arrays to be used with the TableParameterHandler that parses
 // the LongVectorOpTable.xml file for us.
 static TableParameter BinaryOpParameters[] = {
@@ -90,6 +97,20 @@ TEST_F(LongVector::OpTest, binaryOpTest) {
   dispatchTestByDataType(OpType, DataType, Handler);
 }
 
+TEST_F(LongVector::OpTest, trigonometricOpTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  const int TableSize = sizeof(UnaryOpParameters) / sizeof(TableParameter);
+  TableParameterHandler Handler(UnaryOpParameters, TableSize);
+
+  std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str);
+  std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str);
+
+  auto OpType = LongVector::getTrigonometricOpType(OpTypeString);
+  dispatchTestByDataType(OpType, DataType, Handler);
+}
+
 TEST_F(LongVector::OpTest, unaryOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.h b/tools/clang/unittests/HLSLExec/LongVectors.h
index 9157da679d..0e046d1966 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.h
+++ b/tools/clang/unittests/HLSLExec/LongVectors.h
@@ -35,6 +35,11 @@ class OpTest {
                        L"Table:LongVectorOpTable.xml#BinaryOpTable")
   END_TEST_METHOD()
 
+  BEGIN_TEST_METHOD(trigonometricOpTest)
+  TEST_METHOD_PROPERTY(L"DataSource",
+                       L"Table:LongVectorOpTable.xml#TrigonometricOpTable")
+  END_TEST_METHOD()
+
   BEGIN_TEST_METHOD(unaryOpTest)
   TEST_METHOD_PROPERTY(L"DataSource",
                        L"Table:LongVectorOpTable.xml#UnaryOpTable")
@@ -150,6 +155,39 @@ static_assert(_countof(unaryOpTypeStringToEnumMap) ==
 
 UnaryOpType getUnaryOpType(const std::wstring &OpTypeString);
 
+enum TrigonometricOpType {
+  TrigonometricOpType_Acos,
+  TrigonometricOpType_Asin,
+  TrigonometricOpType_Atan,
+  TrigonometricOpType_Cos,
+  TrigonometricOpType_Cosh,
+  TrigonometricOpType_Sin,
+  TrigonometricOpType_Sinh,
+  TrigonometricOpType_Tan,
+  TrigonometricOpType_Tanh,
+  TrigonometricOpType_EnumValueCount
+};
+
+static const LongVectorOpTypeStringToEnumValue
+    trigonometricOpTypeStringToEnumMap[] = {
+        {L"TrigonometricOpType_Acos", TrigonometricOpType_Acos},
+        {L"TrigonometricOpType_Asin", TrigonometricOpType_Asin},
+        {L"TrigonometricOpType_Atan", TrigonometricOpType_Atan},
+        {L"TrigonometricOpType_Cos", TrigonometricOpType_Cos},
+        {L"TrigonometricOpType_Cosh", TrigonometricOpType_Cosh},
+        {L"TrigonometricOpType_Sin", TrigonometricOpType_Sin},
+        {L"TrigonometricOpType_Sinh", TrigonometricOpType_Sinh},
+        {L"TrigonometricOpType_Tan", TrigonometricOpType_Tan},
+        {L"TrigonometricOpType_Tanh", TrigonometricOpType_Tanh},
+};
+
+static_assert(_countof(trigonometricOpTypeStringToEnumMap) ==
+                  TrigonometricOpType_EnumValueCount,
+              "trigonometricOpTypeStringToEnumMap size mismatch. Did you add "
+              "a new enum value?");
+
+TrigonometricOpType getTrigonometricOpType(const std::wstring &OpTypeString);
+
 template <typename DataTypeT>
 std::vector<DataTypeT> getInputValueSetByKey(const std::wstring &Key,
                                              bool LogKey = true) {
@@ -214,6 +252,7 @@ template <typename DataTypeT, typename LongVectorOpTypeT> class TestConfig {
 
   TestConfig(UnaryOpType OpType);
   TestConfig(BinaryOpType OpType);
+  TestConfig(TrigonometricOpType OpType);
 
   bool isBinaryOp() const {
     return BasicOpType == LongVector::BasicOpType_Binary ||
@@ -238,9 +277,15 @@ template <typename DataTypeT, typename LongVectorOpTypeT> class TestConfig {
   DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B,
                                  BinaryOpType OpType) const;
   DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B) const;
+  DataTypeT computeExpectedValue(const DataTypeT &A,
+                                 TrigonometricOpType OpType) const;
   DataTypeT computeExpectedValue(const DataTypeT &A, UnaryOpType OpType) const;
   DataTypeT computeExpectedValue(const DataTypeT &A) const;
 
+  void setInputArgsArrayName(const std::wstring &InputArgsArrayName) {
+    this->InputArgsArrayName = InputArgsArrayName;
+  }
+
   void setInputValueSet1(const std::wstring &InputValueSetName) {
     this->InputValueSetName1 = InputValueSetName;
   }
@@ -257,6 +302,8 @@ template <typename DataTypeT, typename LongVectorOpTypeT> class TestConfig {
     return getInputValueSet(2);
   }
 
+  std::vector<DataTypeT> getInputArgsArray() const;
+
   float getTolerance() const { return Tolerance; }
   LongVector::ValidationType getValidationType() const {
     return ValidationType;
@@ -278,6 +325,8 @@ template <typename DataTypeT, typename LongVectorOpTypeT> class TestConfig {
   LongVector::TestConfigTraits<LongVectorOpTypeT> OpTypeTraits;
   std::wstring InputValueSetName1 = L"DefaultInputValueSet1";
   std::wstring InputValueSetName2 = L"DefaultInputValueSet2";
+  // No default args array
+  std::wstring InputArgsArrayName = L"";
 }; // class LongVector::TestConfig
 
 }; // namespace LongVector
diff --git a/tools/clang/unittests/HLSLExec/LongVectors.tpp b/tools/clang/unittests/HLSLExec/LongVectors.tpp
index 331d4452eb..29affa4b2e 100644
--- a/tools/clang/unittests/HLSLExec/LongVectors.tpp
+++ b/tools/clang/unittests/HLSLExec/LongVectors.tpp
@@ -342,6 +342,59 @@ LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::TestConfig(LongVector::Bin
   }
 }
 
+template <typename DataTypeT, typename LongVectorOpTypeT>
+LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::TestConfig(LongVector::TrigonometricOpType OpType)
+    : OpTypeTraits(OpType) {
+  IntrinsicString = "";
+  BasicOpType = LongVector::BasicOpType_Unary;
+
+  // All trigonometric ops operate on floating point types.
+  // These trig functions are defined to have a max absolute error of 0.0008
+  // as per the D3D functional specs. An example with this spec for sin and
+  // cos is available here:
+  // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#22.10.20
+  ValidationType = LongVector::ValidationType_Epsilon;
+  if (std::is_same_v<DataTypeT, HLSLHalf_t>)
+    Tolerance = 0.0010f;
+  else if (std::is_same_v<DataTypeT, float>)
+    Tolerance = 0.0008f;
+  else
+    VERIFY_FAIL(
+        "Invalid type for trigonometric op. Expecting half or float.");
+
+  switch (OpType) {
+  case LongVector::TrigonometricOpType_Acos:
+    IntrinsicString = "acos";
+    break;
+  case LongVector::TrigonometricOpType_Asin:
+    IntrinsicString = "asin";
+    break;
+  case LongVector::TrigonometricOpType_Atan:
+    IntrinsicString = "atan";
+    break;
+  case LongVector::TrigonometricOpType_Cos:
+    IntrinsicString = "cos";
+    break;
+  case LongVector::TrigonometricOpType_Cosh:
+    IntrinsicString = "cosh";
+    break;
+  case LongVector::TrigonometricOpType_Sin:
+    IntrinsicString = "sin";
+    break;
+  case LongVector::TrigonometricOpType_Sinh:
+    IntrinsicString = "sinh";
+    break;
+  case LongVector::TrigonometricOpType_Tan:
+    IntrinsicString = "tan";
+    break;
+  case LongVector::TrigonometricOpType_Tanh:
+    IntrinsicString = "tanh";
+    break;
+  default:
+    VERIFY_FAIL("Invalid TrigonometricOpType");
+  }
+}
+
 template <typename DataTypeT, typename LongVectorOpTypeT>
 bool LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::hasFunctionDefinition() const {
   if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::UnaryOpType>) {
@@ -463,6 +516,13 @@ DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedV
 template <typename DataTypeT, typename LongVectorOpTypeT>
 DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A) const {
 
+  if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::TrigonometricOpType>) {
+    const auto OpType = static_cast<LongVector::TrigonometricOpType>(OpTypeTraits.OpType);
+    // HLSLHalf_t is a struct. We need to call the constructor to get the
+    // expected value.
+    return computeExpectedValue(A, OpType);
+  }
+
   if constexpr (std::is_same_v<LongVectorOpTypeT, LongVector::UnaryOpType>) {
     const auto OpType = static_cast<LongVector::UnaryOpType>(OpTypeTraits.OpType);
     // HLSLHalf_t is a struct. We need to call the constructor to get the
@@ -477,6 +537,67 @@ DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedV
   return DataTypeT();
 }
 
+template <typename DataTypeT, typename LongVectorOpTypeT>
+DataTypeT LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::computeExpectedValue(const DataTypeT &A,
+                              LongVector::TrigonometricOpType OpType) const {
+  // The trig functions are only valid on floating point types. The constexpr in
+  // this case is a relatively easy and clean way to prevent the compiler from
+  // erroring out trying to resolve these for the non floating point types. We
+  // won't use them in the first place.
+  if constexpr (isFloatingPointType<DataTypeT>()) {
+    switch (OpType) {
+    case LongVector::TrigonometricOpType_Acos:
+      return std::acos(A);
+    case LongVector::TrigonometricOpType_Asin:
+      return std::asin(A);
+    case LongVector::TrigonometricOpType_Atan:
+      return std::atan(A);
+    case LongVector::TrigonometricOpType_Cos:
+      return std::cos(A);
+    case LongVector::TrigonometricOpType_Cosh:
+      return std::cosh(A);
+    case LongVector::TrigonometricOpType_Sin:
+      return std::sin(A);
+    case LongVector::TrigonometricOpType_Sinh:
+      return std::sinh(A);
+    case LongVector::TrigonometricOpType_Tan:
+      return std::tan(A);
+    case LongVector::TrigonometricOpType_Tanh:
+      return std::tanh(A);
+    default:
+      LOG_ERROR_FMT_THROW(L"Unknown TrigonometricOpType: %d",
+                          OpTypeTraits.OpType);
+      return DataTypeT();
+    }
+  }
+
+  LOG_ERROR_FMT_THROW(L"ComputeExpectedValue(const DataTypeT &A, "
+                      L"LongVectorOpTypeT OpType) called on a "
+                      L"non-float type: %d",
+                      OpType);
+
+  return DataTypeT();
+}
+
+template <typename DataTypeT, typename LongVectorOpTypeT>
+std::vector<DataTypeT>  LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getInputArgsArray() const {
+
+  std::vector<DataTypeT> InputArgs;
+
+  std::wstring InputArgsArrayName = this->InputArgsArrayName;
+
+  if (InputArgsArrayName.empty())
+    VERIFY_FAIL("No args array name set.");
+
+  if (std::is_same_v<DataTypeT, HLSLBool_t> && isClampOp())
+    VERIFY_FAIL("Clamp is not supported for bools.");
+  else
+    return getInputValueSetByKey<DataTypeT>(InputArgsArrayName, false);
+
+  VERIFY_FAIL("Invalid type for args array.");
+  return std::vector<DataTypeT>();
+}
+
 template <typename DataTypeT, typename LongVectorOpTypeT>
 std::string LongVector::TestConfig<DataTypeT, LongVectorOpTypeT>::getCompilerOptionsString(size_t VectorSize) const {
   std::stringstream CompilerOptions("");