diff --git a/.github/workflows/clang-format-checker.yml b/.github/workflows/clang-format-checker.yml index d1887e4519..74b734a7c0 100644 --- a/.github/workflows/clang-format-checker.yml +++ b/.github/workflows/clang-format-checker.yml @@ -12,7 +12,7 @@ jobs: permissions: pull-requests: write steps: - - name: Fetch LLVM sources + - name: Fetch DirectXShaderCompiler sources uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: ref: ${{ github.event.pull_request.head.sha }} @@ -31,6 +31,20 @@ jobs: separator: "," skip_initial_fetch: true + # We need to pull the script from the main branch, so that we ensure + # we get the latest version of this script. + - name: Fetch code formatting utils + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: microsoft/DirectXShaderCompiler + ref: ${{ github.base_ref }} + sparse-checkout: | + utils/git/requirements_formatting.txt + utils/git/code-format-helper.py + utils/git/code-format-save-diff.py + sparse-checkout-cone-mode: false + path: code-format-tools + - name: "Listed files" env: LISTED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} @@ -48,10 +62,10 @@ jobs: with: python-version: '3.11' cache: 'pip' - cache-dependency-path: 'utils/git/requirements_formatting.txt' + cache-dependency-path: 'code-format-tools/utils/git/requirements_formatting.txt' - name: Install python dependencies - run: pip install -r utils/git/requirements_formatting.txt + run: pip install -r code-format-tools/utils/git/requirements_formatting.txt - name: Run code formatter id: formatter @@ -61,7 +75,7 @@ jobs: END_REV: ${{ github.event.pull_request.head.sha }} CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} run: | - python utils/git/code-format-helper.py \ + python code-format-tools/utils/git/code-format-helper.py \ --token ${{ secrets.GITHUB_TOKEN }} \ --issue-number $GITHUB_PR_NUMBER \ --start-rev $START_REV \ @@ -92,28 +106,37 @@ jobs: } catch (err) { core.setFailed(`Request failed with error ${err}`) } - - name: Fetch LLVM sources - uses: actions/checkout@v4 + + # We need to pull the script from the main branch, so that we ensure + # we get the latest version of this script. 
+ - name: Fetch code formatting utils + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - fetch-depth: 2 - path: build/main_src + repository: microsoft/DirectXShaderCompiler + ref: ${{ github.base_ref }} + sparse-checkout: | + utils/git/requirements_formatting.txt + utils/git/code-format-helper.py + utils/git/code-format-save-diff.py + sparse-checkout-cone-mode: false + path: code-format-tools - name: Setup Python env uses: actions/setup-python@v4 with: python-version: '3.11' cache: 'pip' - cache-dependency-path: 'build/main_src/utils/git/requirements_formatting.txt' + cache-dependency-path: 'code-format-tools/utils/git/requirements_formatting.txt' - name: Install python dependencies - run: pip install -r build/main_src/utils/git/requirements_formatting.txt + run: pip install -r code-format-tools/utils/git/requirements_formatting.txt - name: Apply code diff env: GITHUB_PR_NUMBER: ${{ github.event.issue.number }} COMMENT_ID: ${{ github.event.comment.id }} run: | - python build/main_src/utils/git/code-format-save-diff.py \ + python code-format-tools/utils/git/code-format-save-diff.py \ --token ${{ secrets.GITHUB_TOKEN }} \ --issue-number $GITHUB_PR_NUMBER \ --tmp-diff-file $TMP_DIFF_FILE \ diff --git a/autoconf/config.guess b/autoconf/config.guess index cf0541d1f1..62df94c187 100755 --- a/autoconf/config.guess +++ b/autoconf/config.guess @@ -929,6 +929,9 @@ EOF ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; + loongarch64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 33c5349f9e..dee579287c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -52,17 +52,17 @@ stages: variables: macOS: macOS-latest - linux: Ubuntu-latest + linux: Ubuntu-22.04 # FIXME: #7364, DXC does not build correctly with GCC 13+ strategy: matrix: - Linux_Clang_Release: + Linux_Clang_RelWithDebInfo: image: ${{ variables.linux }} - configuration: Release + configuration: RelWithDebInfo CC: clang-18 CXX: clang++-18 - CMAKE_OPTS: -DLLVM_ENABLE_WERROR=On -DLLVM_USE_SANITIZER='Address;Undefined' -DLLVM_ENABLE_LIBCXX=On -DLLVM_USE_LINKER=lld - CHECK_ALL_ENV: ASAN_OPTIONS=alloc_dealloc_mismatch=0 + CMAKE_OPTS: -DLLVM_ENABLE_WERROR=On -DLLVM_USE_SANITIZER='Address;Undefined' -DLLVM_ENABLE_LIBCXX=On -DLLVM_USE_LINKER=lld-18 + CHECK_ALL_ENV: ASAN_OPTIONS=alloc_dealloc_mismatch=0 LSAN_OPTIONS=suppressions=$BUILD_SOURCESDIRECTORY/utils/asan/x86_64-pc-linux-gnu.lsan.supp:print_suppressions=0 ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-18 LSAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-18 OS: Linux Linux_Clang_Debug: image: ${{ variables.linux }} @@ -107,6 +107,8 @@ stages: versionSpec: '3.x' - bash: | + sudo apt-get update + sudo apt-get upgrade libc6 libc6-dbg sudo apt-get install ninja-build wget https://apt.llvm.org/llvm.sh chmod u+x llvm.sh diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index 4541d08162..226881ad30 100644 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -367,6 +367,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "wasm64") set(LLVM_NATIVE_ARCH WebAssembly) elseif (LLVM_NATIVE_ARCH MATCHES "riscv64") set(LLVM_NATIVE_ARCH RISCV) +elseif (LLVM_NATIVE_ARCH MATCHES "loongarch64") + set(LLVM_NATIVE_ARCH LoongArch) elseif (LLVM_NATIVE_ARCH MATCHES "e2k") set(LLVM_NATIVE_ARCH E2K) else () diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index acf76c2907..00bdaed363 100644 --- 
a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -301,7 +301,6 @@ if( MSVC ) set(msvc_warning_flags # Disabled warnings. - -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned' -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored' -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data' -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used' diff --git a/docs/DXIL.rst b/docs/DXIL.rst index a1c5055085..1a2a691d27 100644 --- a/docs/DXIL.rst +++ b/docs/DXIL.rst @@ -2419,6 +2419,10 @@ ID Name Description 302 ReservedC9 reserved 303 RawBufferVectorLoad reads from a raw buffer and structured buffer 304 RawBufferVectorStore writes to a RWByteAddressBuffer or RWStructuredBuffer +305 MatVecMul Multiplies a MxK dimension matrix and a K sized input vector +306 MatVecMulAdd multiplies a MxK dimension matrix and a K sized input vector and adds an M-sized bias vector +307 OuterProductAccumulate Computes the outer product between column vectors and an MxN matrix is accumulated component-wise atomically (with device scope) in memory +308 VectorAccumulate Accumulates the components of a vector component-wise atomically (with device scope) to the corresponding elements of an array in memory === ===================================================== ======================================================================================================================================================================================================================= @@ -3065,287 +3069,299 @@ The set of validation rules that are known to hold for a DXIL program is identif .. hctdb_instrhelp.get_valrules_rst() .. 
VALRULES-RST:BEGIN -===================================================== ======================================================================================================================================================================================================================================================================================================== -Rule Code Description -===================================================== ======================================================================================================================================================================================================================================================================================================== -BITCODE.VALID Module must be bitcode-valid -CONTAINER.CONTENTINVALID DXIL Container Content is well-formed -CONTAINER.CONTENTMATCHES DXIL Container Content must match Module -CONTAINER.PARTINVALID DXIL Container must not contain unknown parts -CONTAINER.PARTMATCHES DXIL Container Parts must match Module -CONTAINER.PARTMISSING DXIL Container requires certain parts, corresponding to module -CONTAINER.PARTREPEATED DXIL Container must have only one of each part type -CONTAINER.ROOTSIGNATUREINCOMPATIBLE Root Signature in DXIL Container must be compatible with shader -CONTAINER.UNUSEDITEMINTABLE Items in Table must be used -DECL.ALLOCATERAYQUERY2FLAGSARECONST constRayFlags and RayQueryFlags for AllocateRayQuery2 must be constant -DECL.ALLOCATERAYQUERYFLAGSARECONST RayFlags for AllocateRayQuery must be constant -DECL.ALLOWOPACITYMICROMAPSEXPECTEDGIVENFORCEOMM2STATE When the ForceOMM2State ConstRayFlag is given as an argument to a RayQuery object, AllowOpacityMicromaps is expected as a RayQueryFlag argument -DECL.ATTRSTRUCT Attributes parameter must be struct type -DECL.DXILFNEXTERN External function must be a DXIL function -DECL.DXILNSRESERVED The DXIL reserved prefixes must only be used by built-in functions and types -DECL.EXTRAARGS Extra arguments not allowed for shader functions -DECL.FNATTRIBUTE Functions should only contain known function attributes -DECL.FNFLATTENPARAM Function parameters must not use struct types -DECL.FNISCALLED Functions can only be used by call instructions -DECL.MULTIPLENODEINPUTS A node shader may not have more than one input record -DECL.NODELAUNCHINPUTTYPE Invalid input record type for node launch type -DECL.NOTUSEDEXTERNAL External declaration should not be used -DECL.PARAMSTRUCT Callable function parameter must be struct type -DECL.PAYLOADSTRUCT Payload parameter must be struct type -DECL.RAYQUERYINFNSIG Rayquery objects not allowed in function signatures -DECL.RESOURCEINFNSIG Resources not allowed in function signatures -DECL.SHADERMISSINGARG payload/params/attributes parameter is required for certain shader types -DECL.SHADERRETURNVOID Shader functions must return void -DECL.USEDEXTERNALFUNCTION External function must be used -DECL.USEDINTERNAL Internal declaration must be used -FLOW.DEADLOOP Loop must have break. -FLOW.FUNCTIONCALL Function with parameter is not permitted -FLOW.NORECURSION Recursion is not permitted. -FLOW.REDUCIBLE Execution flow must be reducible. -INSTR.ALLOWED Instructions must be of an allowed type. -INSTR.ATOMICCONST Constant destination to atomic. -INSTR.ATOMICINTRINNONUAV Non-UAV destination to atomic intrinsic. -INSTR.ATOMICOPNONGROUPSHAREDORRECORD Non-groupshared or node record destination to atomic operation. 
-INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function. -INSTR.BARRIERFLAGINVALID Invalid %0 flags on DXIL operation '%1' -INSTR.BARRIERMODEFORNONCS sync in a non-Compute/Amplification/Mesh/Node Shader must only sync UAV (sync_uglobal). -INSTR.BARRIERMODENOMEMORY sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory). Only _t (thread group sync) is optional. -INSTR.BARRIERMODEUSELESSUGROUP sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal. -INSTR.BARRIERNONCONSTANTFLAGARGUMENT Memory type, access, or sync flag is not constant -INSTR.BARRIERREQUIRESNODE sync in a non-Node Shader must not sync node record memory. -INSTR.BUFFERUPDATECOUNTERONRESHASCOUNTER BufferUpdateCounter valid only when HasCounter is true. -INSTR.BUFFERUPDATECOUNTERONUAV BufferUpdateCounter valid only on UAV. -INSTR.CALLOLOAD Call to DXIL intrinsic must match overload signature -INSTR.CANNOTPULLPOSITION pull-model evaluation of position disallowed -INSTR.CBUFFERCLASSFORCBUFFERHANDLE Expect Cbuffer for CBufferLoad handle. -INSTR.CBUFFEROUTOFBOUND Cbuffer access out of bound. -INSTR.CHECKACCESSFULLYMAPPED CheckAccessFullyMapped should only be used on resource status. -INSTR.CONSTALIGNFORRAWBUF Raw Buffer alignment value must be a constant. -INSTR.COORDINATECOUNTFORRAWTYPEDBUF raw/typed buffer offset must be undef. -INSTR.COORDINATECOUNTFORSTRUCTBUF structured buffer requires defined index and offset coordinates. -INSTR.CREATEHANDLEIMMRANGEID Local resource must map to global resource. -INSTR.DXILSTRUCTUSER Dxil struct types should only be used by ExtractValue. -INSTR.DXILSTRUCTUSEROUTOFBOUND Index out of bound when extract value from dxil struct types. -INSTR.EVALINTERPOLATIONMODE Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample. -INSTR.EXTRACTVALUE ExtractValue should only be used on dxil struct types and cmpxchg. -INSTR.FAILTORESLOVETGSMPOINTER TGSM pointers must originate from an unambiguous TGSM global variable. -INSTR.HANDLENOTFROMCREATEHANDLE Resource handle should returned by createHandle. -INSTR.ILLEGALDXILOPCODE DXILOpCode must be [0..%0]. %1 specified. -INSTR.ILLEGALDXILOPFUNCTION '%0' is not a DXILOpFuncition for DXILOpcode '%1'. -INSTR.IMMBIASFORSAMPLEB bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate. -INSTR.INBOUNDSACCESS Access to out-of-bounds memory is disallowed. -INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM Use of undef coherence hint or num coherence hint bits in MaybeReorderThread. -INSTR.MINPRECISIONNOTPRECISE Instructions marked precise may not refer to minprecision values. -INSTR.MINPRECISONBITCAST Bitcast on minprecison types is not allowed. -INSTR.MIPLEVELFORGETDIMENSION Use mip level on buffer when GetDimensions. -INSTR.MIPONUAVLOAD uav load don't support mipLevel/sampleIndex. -INSTR.MISSINGSETMESHOUTPUTCOUNTS Missing SetMeshOutputCounts call. -INSTR.MULTIPLEGETMESHPAYLOAD GetMeshPayload cannot be called multiple times. -INSTR.MULTIPLESETMESHOUTPUTCOUNTS SetMeshOUtputCounts cannot be called multiple times. -INSTR.NODERECORDHANDLEUSEAFTERCOMPLETE Invalid use of completed record handle. -INSTR.NOGENERICPTRADDRSPACECAST Address space cast between pointer types must have one part to be generic address space. -INSTR.NOIDIVBYZERO No signed integer division by zero. 
-INSTR.NOINDEFINITEACOS No indefinite arccosine. -INSTR.NOINDEFINITEASIN No indefinite arcsine. -INSTR.NOINDEFINITEDSXY No indefinite derivative calculation. -INSTR.NOINDEFINITELOG No indefinite logarithm. -INSTR.NONDOMINATINGDISPATCHMESH Non-Dominating DispatchMesh call. -INSTR.NONDOMINATINGSETMESHOUTPUTCOUNTS Non-Dominating SetMeshOutputCounts call. -INSTR.NOREADINGUNINITIALIZED Instructions should not read uninitialized value. -INSTR.NOTONCEDISPATCHMESH DispatchMesh must be called exactly once in an Amplification shader. -INSTR.NOUDIVBYZERO No unsigned integer division by zero. -INSTR.OFFSETONUAVLOAD uav load don't support offset. -INSTR.OLOAD DXIL intrinsic overload must be valid. -INSTR.ONLYONEALLOCCONSUME RWStructuredBuffers may increment or decrement their counters, but not both. -INSTR.OPCODERESERVED Instructions must not reference reserved opcodes. -INSTR.OPCONST DXIL intrinsic requires an immediate constant operand -INSTR.OPCONSTRANGE Constant values must be in-range for operation. -INSTR.OPERANDRANGE DXIL intrinsic operand must be within defined range -INSTR.PTRBITCAST Pointer type bitcast must be have same size. -INSTR.RESOURCECLASSFORLOAD load can only run on UAV/SRV resource. -INSTR.RESOURCECLASSFORSAMPLERGATHER sample, lod and gather should be on srv resource. -INSTR.RESOURCECLASSFORUAVSTORE store should be on uav resource. -INSTR.RESOURCECOORDINATEMISS coord uninitialized. -INSTR.RESOURCECOORDINATETOOMANY out of bound coord must be undef. -INSTR.RESOURCEKINDFORBUFFERLOADSTORE buffer load/store only works on Raw/Typed/StructuredBuffer. -INSTR.RESOURCEKINDFORCALCLOD lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray. -INSTR.RESOURCEKINDFORGATHER gather requires resource declared as texture/2D/Cube/2DArray/CubeArray. -INSTR.RESOURCEKINDFORGETDIM Invalid resource kind on GetDimensions. -INSTR.RESOURCEKINDFORSAMPLE sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray. -INSTR.RESOURCEKINDFORSAMPLEC samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray. -INSTR.RESOURCEKINDFORTEXTURELOAD texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray. -INSTR.RESOURCEKINDFORTEXTURESTORE texture store only works on Texture1D/1DArray/2D/2DArray/3D. -INSTR.RESOURCEKINDFORTRACERAY TraceRay should only use RTAccelerationStructure. -INSTR.RESOURCEMAPTOSINGLEENTRY Fail to map resource to resource table. -INSTR.RESOURCEOFFSETMISS offset uninitialized. -INSTR.RESOURCEOFFSETTOOMANY out of bound offset must be undef. -INSTR.RESOURCEUSER Resource should only be used by Load/GEP/Call. -INSTR.SAMPLECOMPTYPE sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT. -INSTR.SAMPLEINDEXFORLOAD2DMS load on Texture2DMS/2DMSArray require sampleIndex. -INSTR.SAMPLERMODEFORLOD lod instruction requires sampler declared in default mode. -INSTR.SAMPLERMODEFORSAMPLE sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode. -INSTR.SAMPLERMODEFORSAMPLEC sample_c_*/gather_c instructions require sampler declared in comparison mode. -INSTR.SIGNATUREOPERATIONNOTINENTRY Dxil operation for input output signature must be in entryPoints. -INSTR.STATUS Resource status should only be used by CheckAccessFullyMapped. -INSTR.STRUCTBITCAST Bitcast on struct types is not allowed. -INSTR.SVCONFLICTINGLAUNCHMODE Input system values are compatible with node shader launch mode. 
-INSTR.TEXTUREOFFSET offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7. -INSTR.TGSMRACECOND Race condition writing to shared memory detected, consider making this write conditional. -INSTR.UNDEFHITOBJECT HitObject is undef. -INSTR.UNDEFINEDVALUEFORUAVSTORE Assignment of undefined values to UAV. -INSTR.UNDEFRESULTFORGETDIMENSION GetDimensions used undef dimension %0 on %1. -INSTR.WRITEMASKFORTYPEDUAVSTORE store on typed uav must write to all four components of the UAV. -INSTR.WRITEMASKGAPFORUAV UAV write mask must be contiguous, starting at x: .x, .xy, .xyz, or .xyzw. -INSTR.WRITEMASKMATCHVALUEFORUAVSTORE uav store write mask must match store value mask, write mask is %0 and store value mask is %1. -META.BARYCENTRICSFLOAT3 only 'float3' type is allowed for SV_Barycentrics. -META.BARYCENTRICSINTERPOLATION SV_Barycentrics cannot be used with 'nointerpolation' type. -META.BARYCENTRICSTWOPERSPECTIVES There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode. -META.BRANCHFLATTEN Can't use branch and flatten attributes together. -META.CLIPCULLMAXCOMPONENTS Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components -META.CLIPCULLMAXROWS Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows. -META.COMPUTEWITHNODE Compute entry must not have node metadata -META.CONTROLFLOWHINTNOTONCONTROLFLOW Control flow hint only works on control flow inst. -META.DENSERESIDS Resource identifiers must be zero-based and dense. -META.DUPLICATESYSVALUE System value may only appear once in signature -META.ENTRYFUNCTION entrypoint not found. -META.FLAGSUSAGE Flags must match usage. -META.FORCECASEONSWITCH Attribute forcecase only works for switch. -META.GLCNOTONAPPENDCONSUME globallycoherent cannot be used with append/consume buffers: '%0'. -META.INTEGERINTERPMODE Interpolation mode on integer must be Constant -META.INTERPMODEINONEROW Interpolation mode must be identical for all elements packed into the same row. -META.INTERPMODEVALID Interpolation mode must be valid -META.INVALIDCONTROLFLOWHINT Invalid control flow hint. -META.KNOWN Named metadata should be known -META.MAXTESSFACTOR Hull Shader MaxTessFactor must be [%0..%1]. %2 specified. -META.NOENTRYPROPSFORENTRY Entry point %0 must have entry properties. -META.NOSEMANTICOVERLAP Semantics must not overlap -META.REQUIRED Required metadata missing. -META.SEMAKINDMATCHESNAME Semantic name must match system value, when defined. -META.SEMAKINDVALID Semantic kind must be valid -META.SEMANTICCOMPTYPE %0 must be %1. -META.SEMANTICINDEXMAX System value semantics have a maximum valid semantic index -META.SEMANTICLEN Semantic length must be at least 1 and at most 64. -META.SEMANTICSHOULDBEALLOCATED Semantic should have a valid packing location -META.SEMANTICSHOULDNOTBEALLOCATED Semantic should have a packing location of -1 -META.SIGNATURECOMPTYPE signature %0 specifies unrecognized or invalid component type. -META.SIGNATUREDATAWIDTH Data width must be identical for all elements packed into the same row. -META.SIGNATUREILLEGALCOMPONENTORDER Component ordering for packed elements must be: arbitrary < system value < system generated value -META.SIGNATUREINDEXCONFLICT Only elements with compatible indexing rules may be packed together -META.SIGNATUREOUTOFRANGE Signature elements must fit within maximum signature size -META.SIGNATUREOVERLAP Signature elements may not overlap in packing location. 
-META.STRUCTBUFALIGNMENT StructuredBuffer stride not aligned -META.STRUCTBUFALIGNMENTOUTOFBOUND StructuredBuffer stride out of bounds -META.SYSTEMVALUEROWS System value may only have 1 row -META.TARGET Target triple must be 'dxil-ms-dx' -META.TESSELLATOROUTPUTPRIMITIVE Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW. -META.TESSELLATORPARTITION Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even. -META.TEXTURETYPE elements of typed buffers and textures must fit in four 32-bit quantities. -META.USED All metadata must be used by dxil. -META.VALIDSAMPLERMODE Invalid sampler mode on sampler . -META.VALUERANGE Metadata value must be within range. -META.VERSIONSUPPORTED Version in metadata must be supported. -META.WELLFORMED Metadata must be well-formed in operand count and types. -SM.64BITRAWBUFFERLOADSTORE i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3. -SM.AMPLIFICATIONSHADERPAYLOADSIZE For amplification shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes. -SM.AMPLIFICATIONSHADERPAYLOADSIZEDECLARED For amplification shader with entry '%0', payload size %1 is greater than declared size of %2 bytes. -SM.APPENDANDCONSUMEONSAMEUAV BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1. -SM.CBUFFERARRAYOFFSETALIGNMENT CBuffer array offset must be aligned to 16-bytes -SM.CBUFFERELEMENTOVERFLOW CBuffer elements must not overflow -SM.CBUFFEROFFSETOVERLAP CBuffer offsets must not overlap -SM.CBUFFERSIZE CBuffer size must not exceed 65536 bytes -SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT D3D12 constant/texture buffer template element can only be a struct. -SM.COMPLETEPOSITION Not all elements of SV_Position were written. -SM.CONSTANTINTERPMODE Interpolation mode must be constant for MS primitive output. -SM.COUNTERONLYONSTRUCTBUF BufferUpdateCounter valid only on structured buffers. -SM.CSNOSIGNATURES Compute shaders must not have shader signatures. -SM.DOMAINLOCATIONIDXOOB DomainLocation component index out of bounds for the domain. -SM.DSINPUTCONTROLPOINTCOUNTRANGE DS input control point count must be [0..%0]. %1 specified. -SM.DXILVERSION Target shader model requires specific Dxil Version -SM.GSINSTANCECOUNTRANGE GS instance count must be [1..%0]. %1 specified. -SM.GSOUTPUTVERTEXCOUNTRANGE GS output vertex count must be [0..%0]. %1 specified. -SM.GSTOTALOUTPUTVERTEXDATARANGE Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2. This value cannot be greater than %3. -SM.GSVALIDINPUTPRIMITIVE GS input primitive unrecognized. -SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY GS output primitive topology unrecognized. -SM.HSINPUTCONTROLPOINTCOUNTRANGE HS input control point count must be [0..%0]. %1 specified. 
-SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH For pass thru hull shader, input control point count must match output control point count -SM.INCOMPATIBLECALLINENTRY Features used in internal function calls must be compatible with entry -SM.INCOMPATIBLEDERIVINCOMPUTESHADERMODEL Derivatives in compute-model shaders require shader model 6.6 and above -SM.INCOMPATIBLEDERIVLAUNCH Node shaders only support derivatives in broadcasting launch mode -SM.INCOMPATIBLEOPERATION Operations used in entry function must be compatible with shader stage and other properties -SM.INCOMPATIBLEREQUIRESGROUP Functions requiring groupshared memory must be called from shaders with a visible group -SM.INCOMPATIBLESHADERMODEL Functions may only use features available in the current shader model -SM.INCOMPATIBLESTAGE Functions may only use features available in the entry function's stage -SM.INCOMPATIBLETHREADGROUPDIM When derivatives are used in compute-model shaders, the thread group dimensions must be compatible -SM.INSIDETESSFACTORSIZEMATCHDOMAIN InsideTessFactor rows, columns (%0, %1) invalid for domain %2. Expected %3 rows and 1 column. -SM.INVALIDRESOURCECOMPTYPE Invalid resource return type. -SM.INVALIDRESOURCEKIND Invalid resources kind. -SM.INVALIDSAMPLERFEEDBACKTYPE Invalid sampler feedback type. -SM.INVALIDTEXTUREKINDONUAV TextureCube[Array] resources are not supported with UAVs. -SM.ISOLINEOUTPUTPRIMITIVEMISMATCH Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain. -SM.MAXMSSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1. -SM.MAXTGSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1. -SM.MAXTHEADGROUP Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1. -SM.MESHPSIGROWCOUNT For shader '%0', primitive output signatures are taking up more than %1 rows. -SM.MESHSHADERINOUTSIZE For shader '%0', payload plus output size is greater than %1. -SM.MESHSHADERMAXPRIMITIVECOUNT MS max primitive output count must be [0..%0]. %1 specified. -SM.MESHSHADERMAXVERTEXCOUNT MS max vertex output count must be [0..%0]. %1 specified. -SM.MESHSHADEROUTPUTSIZE For shader '%0', vertex plus primitive output size is greater than %1. -SM.MESHSHADERPAYLOADSIZE For mesh shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes. -SM.MESHSHADERPAYLOADSIZEDECLARED For mesh shader with entry '%0', payload size %1 is greater than declared size of %2 bytes. -SM.MESHTOTALSIGROWCOUNT For shader '%0', vertex and primitive output signatures are taking up more than %1 rows. -SM.MESHVSIGROWCOUNT For shader '%0', vertex output signatures are taking up more than %1 rows. -SM.MULTISTREAMMUSTBEPOINT When multiple GS output streams are used they must be pointlists -SM.NAME Target shader model name must be known -SM.NOINTERPMODE Interpolation mode must be undefined for VS input/PS output/patch constant. -SM.NOPSOUTPUTIDX Pixel shader output registers are not indexable. -SM.OPCODE Opcode must be defined in target shader model -SM.OPCODEININVALIDFUNCTION Invalid DXIL opcode usage like StorePatchConstant in patch constant function -SM.OPERAND Operand must be defined in target shader model. -SM.OUTPUTCONTROLPOINTCOUNTRANGE output control point count must be [%0..%1]. %2 specified. -SM.OUTPUTCONTROLPOINTSTOTALSCALARS Total number of scalars across all HS output control points must not exceed . -SM.PATCHCONSTANTONLYFORHSDS patch constant signature only valid in HS and DS. 
-SM.PROGRAMVERSION Program Version in Dxil Container does not match Dxil Module shader model version -SM.PSCONSISTENTINTERP Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample). -SM.PSCOVERAGEANDINNERCOVERAGE InnerCoverage and Coverage are mutually exclusive. -SM.PSMULTIPLEDEPTHSEMANTIC Pixel Shader only allows one type of depth semantic to be declared. -SM.PSOUTPUTSEMANTIC Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found. -SM.PSTARGETCOL0 SV_Target packed location must start at column 0. -SM.PSTARGETINDEXMATCHESROW SV_Target semantic index must match packed row location. -SM.RAYSHADERPAYLOADSIZE For shader '%0', %1 size is smaller than argument's allocation size. -SM.RAYSHADERSIGNATURES Ray tracing shader '%0' should not have any shader signatures. -SM.RESOURCERANGEOVERLAP Resource ranges must not overlap -SM.ROVONLYINPS RasterizerOrdered objects are only allowed in 5.0+ pixel shaders. -SM.SAMPLECOUNTONLYON2DMS Only Texture2DMS/2DMSArray could has sample count. -SM.SEMANTIC Semantic must be defined in target shader model -SM.STREAMINDEXRANGE Stream index (%0) must between 0 and %1. -SM.TESSFACTORFORDOMAIN Required TessFactor for domain not found declared anywhere in Patch Constant data. -SM.TESSFACTORSIZEMATCHDOMAIN TessFactor rows, columns (%0, %1) invalid for domain %2. Expected %3 rows and 1 column. -SM.TGSMUNSUPPORTED Thread Group Shared Memory not supported %0. -SM.THREADGROUPCHANNELRANGE Declared Thread Group %0 size %1 outside valid range [%2..%3]. -SM.TRIOUTPUTPRIMITIVEMISMATCH Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain. -SM.UNDEFINEDOUTPUT Not all elements of output %0 were written. -SM.VALIDDOMAIN Invalid Tessellator Domain specified. Must be isoline, tri or quad. -SM.VIEWIDNEEDSSLOT ViewID requires compatible space in pixel shader input signature -SM.WAVESIZEALLZEROWHENUNDEFINED WaveSize Max and Preferred must be 0 when Min is 0 -SM.WAVESIZEEXPECTSONEPARAM WaveSize tag expects exactly 1 parameter. -SM.WAVESIZEMAXANDPREFERREDZEROWHENNORANGE WaveSize Max and Preferred must be 0 to encode min==max -SM.WAVESIZEMAXGREATERTHANMIN WaveSize Max must greater than Min -SM.WAVESIZENEEDSCONSTANTOPERANDS WaveSize metadata operands must be constant values. -SM.WAVESIZENEEDSSM66OR67 WaveSize is valid only for Shader Model 6.6 and 6.7. -SM.WAVESIZEONCOMPUTEORNODE WaveSize only allowed on compute or node shaders -SM.WAVESIZEPREFERREDINRANGE WaveSize Preferred must be within Min..Max range -SM.WAVESIZERANGEEXPECTSTHREEPARAMS WaveSize Range tag expects exactly 3 parameters. -SM.WAVESIZERANGENEEDSSM68PLUS WaveSize Range is valid only for Shader Model 6.8 and higher. -SM.WAVESIZETAGDUPLICATE WaveSize or WaveSizeRange tag may only appear once per entry point. -SM.WAVESIZEVALUE WaveSize value must be a power of 2 in range [4..128] -SM.ZEROHSINPUTCONTROLPOINTWITHINPUT When HS input control point count is 0, no input signature should exist. -TYPES.DEFINED Type must be defined based on DXIL primitives -TYPES.I8 I8 can only be used as immediate value for intrinsic or as i8* via bitcast by lifetime intrinsics. 
-TYPES.INTWIDTH Int type must be of valid width -TYPES.NOMULTIDIM Only one dimension allowed for array type. -TYPES.NOPTRTOPTR Pointers to pointers, or pointers in structures are not allowed. -TYPES.NOVECTOR Vector types must not be present -===================================================== ======================================================================================================================================================================================================================================================================================================== +============================================================= ======================================================================================================================================================================================================================================================================================================== +Rule Code Description +============================================================= ======================================================================================================================================================================================================================================================================================================== +BITCODE.VALID Module must be bitcode-valid +CONTAINER.CONTENTINVALID DXIL Container Content is well-formed +CONTAINER.CONTENTMATCHES DXIL Container Content must match Module +CONTAINER.PARTINVALID DXIL Container must not contain unknown parts +CONTAINER.PARTMATCHES DXIL Container Parts must match Module +CONTAINER.PARTMISSING DXIL Container requires certain parts, corresponding to module +CONTAINER.PARTREPEATED DXIL Container must have only one of each part type +CONTAINER.ROOTSIGNATUREINCOMPATIBLE Root Signature in DXIL Container must be compatible with shader +CONTAINER.UNUSEDITEMINTABLE Items in Table must be used +DECL.ALLOCATERAYQUERY2FLAGSARECONST constRayFlags and RayQueryFlags for AllocateRayQuery2 must be constant +DECL.ALLOCATERAYQUERYFLAGSARECONST RayFlags for AllocateRayQuery must be constant +DECL.ALLOWOPACITYMICROMAPSEXPECTEDGIVENFORCEOMM2STATE When the ForceOMM2State ConstRayFlag is given as an argument to a RayQuery object, AllowOpacityMicromaps is expected as a RayQueryFlag argument +DECL.ATTRSTRUCT Attributes parameter must be struct type +DECL.DXILFNEXTERN External function must be a DXIL function +DECL.DXILNSRESERVED The DXIL reserved prefixes must only be used by built-in functions and types +DECL.EXTRAARGS Extra arguments not allowed for shader functions +DECL.FNATTRIBUTE Functions should only contain known function attributes +DECL.FNFLATTENPARAM Function parameters must not use struct types +DECL.FNISCALLED Functions can only be used by call instructions +DECL.MULTIPLENODEINPUTS A node shader may not have more than one input record +DECL.NODELAUNCHINPUTTYPE Invalid input record type for node launch type +DECL.NOTUSEDEXTERNAL External declaration should not be used +DECL.PARAMSTRUCT Callable function parameter must be struct type +DECL.PAYLOADSTRUCT Payload parameter must be struct type +DECL.RAYQUERYINFNSIG Rayquery objects not allowed in function signatures +DECL.RESOURCEINFNSIG Resources not allowed in function signatures +DECL.SHADERMISSINGARG payload/params/attributes parameter is required for certain shader types +DECL.SHADERRETURNVOID Shader functions must return void +DECL.USEDEXTERNALFUNCTION External function must be used +DECL.USEDINTERNAL 
Internal declaration must be used +FLOW.DEADLOOP Loop must have break. +FLOW.FUNCTIONCALL Function with parameter is not permitted +FLOW.NORECURSION Recursion is not permitted. +FLOW.REDUCIBLE Execution flow must be reducible. +INSTR.ALLOWED Instructions must be of an allowed type. +INSTR.ATOMICCONST Constant destination to atomic. +INSTR.ATOMICINTRINNONUAV Non-UAV destination to atomic intrinsic. +INSTR.ATOMICOPNONGROUPSHAREDORRECORD Non-groupshared or node record destination to atomic operation. +INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function. +INSTR.BARRIERFLAGINVALID Invalid %0 flags on DXIL operation '%1' +INSTR.BARRIERMODEFORNONCS sync in a non-Compute/Amplification/Mesh/Node Shader must only sync UAV (sync_uglobal). +INSTR.BARRIERMODENOMEMORY sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory). Only _t (thread group sync) is optional. +INSTR.BARRIERMODEUSELESSUGROUP sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal. +INSTR.BARRIERNONCONSTANTFLAGARGUMENT Memory type, access, or sync flag is not constant +INSTR.BARRIERREQUIRESNODE sync in a non-Node Shader must not sync node record memory. +INSTR.BUFFERUPDATECOUNTERONRESHASCOUNTER BufferUpdateCounter valid only when HasCounter is true. +INSTR.BUFFERUPDATECOUNTERONUAV BufferUpdateCounter valid only on UAV. +INSTR.CALLOLOAD Call to DXIL intrinsic must match overload signature +INSTR.CANNOTPULLPOSITION pull-model evaluation of position disallowed +INSTR.CBUFFERCLASSFORCBUFFERHANDLE Expect Cbuffer for CBufferLoad handle. +INSTR.CBUFFEROUTOFBOUND Cbuffer access out of bound. +INSTR.CHECKACCESSFULLYMAPPED CheckAccessFullyMapped should only be used on resource status. +INSTR.CONSTALIGNFORRAWBUF Raw Buffer alignment value must be a constant. +INSTR.COORDINATECOUNTFORRAWTYPEDBUF raw/typed buffer offset must be undef. +INSTR.COORDINATECOUNTFORSTRUCTBUF structured buffer requires defined index and offset coordinates. +INSTR.CREATEHANDLEIMMRANGEID Local resource must map to global resource. +INSTR.DXILSTRUCTUSER Dxil struct types should only be used by ExtractValue. +INSTR.DXILSTRUCTUSEROUTOFBOUND Index out of bound when extract value from dxil struct types. +INSTR.EVALINTERPOLATIONMODE Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample. +INSTR.EXTRACTVALUE ExtractValue should only be used on dxil struct types and cmpxchg. +INSTR.FAILTORESLOVETGSMPOINTER TGSM pointers must originate from an unambiguous TGSM global variable. +INSTR.HANDLENOTFROMCREATEHANDLE Resource handle should returned by createHandle. +INSTR.ILLEGALDXILOPCODE DXILOpCode must be [0..%0]. %1 specified. +INSTR.ILLEGALDXILOPFUNCTION '%0' is not a DXILOpFuncition for DXILOpcode '%1'. +INSTR.IMMBIASFORSAMPLEB bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate. +INSTR.INBOUNDSACCESS Access to out-of-bounds memory is disallowed. +INSTR.LINALGINTERPRETATIONPARAMARECONST In Linalg operations, Interpretation value is a constant. +INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFORMATVECOPS Matrix Layout for Linalg Mul/MulAdd operation must be valid. +INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFOROUTERPRODUCTACCUMULATE Matrix Layout for Linalg Mul/MulAdd operation must be valid. +INSTR.LINALGINVALIDMEMORYINTERPVALUE In Memory Interpolation value must be valid. 
+INSTR.LINALGINVALIDREGISTERINTERPVALUE From Register Interpretation value must be valid. +INSTR.LINALGMATRIXLAYOUTNOTTRANSPOSABLE Row Major and Column Major matrix layouts are not transposable. +INSTR.LINALGMATRIXSHAPEPARAMSARECONST Matrix Layout, Dimensions and isTranspose are constants +INSTR.LINALGMATRIXSTRIDEZEROFOROPTIMALLAYOUTS For optimal layouts, matrix stride must be zero. +INSTR.LINALGNOTANUNSIGNEDTYPE Unsigned flag set for a float signed type +INSTR.MATVECOPISUNSIGNEDFLAGSARECONST In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant. +INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM Use of undef coherence hint or num coherence hint bits in MaybeReorderThread. +INSTR.MINPRECISIONNOTPRECISE Instructions marked precise may not refer to minprecision values. +INSTR.MINPRECISONBITCAST Bitcast on minprecison types is not allowed. +INSTR.MIPLEVELFORGETDIMENSION Use mip level on buffer when GetDimensions. +INSTR.MIPONUAVLOAD uav load don't support mipLevel/sampleIndex. +INSTR.MISSINGSETMESHOUTPUTCOUNTS Missing SetMeshOutputCounts call. +INSTR.MULTIPLEGETMESHPAYLOAD GetMeshPayload cannot be called multiple times. +INSTR.MULTIPLESETMESHOUTPUTCOUNTS SetMeshOUtputCounts cannot be called multiple times. +INSTR.NODERECORDHANDLEUSEAFTERCOMPLETE Invalid use of completed record handle. +INSTR.NOGENERICPTRADDRSPACECAST Address space cast between pointer types must have one part to be generic address space. +INSTR.NOIDIVBYZERO No signed integer division by zero. +INSTR.NOINDEFINITEACOS No indefinite arccosine. +INSTR.NOINDEFINITEASIN No indefinite arcsine. +INSTR.NOINDEFINITEDSXY No indefinite derivative calculation. +INSTR.NOINDEFINITELOG No indefinite logarithm. +INSTR.NONDOMINATINGDISPATCHMESH Non-Dominating DispatchMesh call. +INSTR.NONDOMINATINGSETMESHOUTPUTCOUNTS Non-Dominating SetMeshOutputCounts call. +INSTR.NOREADINGUNINITIALIZED Instructions should not read uninitialized value. +INSTR.NOTONCEDISPATCHMESH DispatchMesh must be called exactly once in an Amplification shader. +INSTR.NOUDIVBYZERO No unsigned integer division by zero. +INSTR.OFFSETONUAVLOAD uav load don't support offset. +INSTR.OLOAD DXIL intrinsic overload must be valid. +INSTR.ONLYONEALLOCCONSUME RWStructuredBuffers may increment or decrement their counters, but not both. +INSTR.OPCODERESERVED Instructions must not reference reserved opcodes. +INSTR.OPCONST DXIL intrinsic requires an immediate constant operand +INSTR.OPCONSTRANGE Constant values must be in-range for operation. +INSTR.OPERANDRANGE DXIL intrinsic operand must be within defined range +INSTR.PARAMMULTIPLE Parameter must be a valid multiple +INSTR.PTRBITCAST Pointer type bitcast must be have same size. +INSTR.REORDERCOHERENTREQUIRESSM69 reordercoherent requires SM 6.9 or later. +INSTR.RESOURCECLASSFORLOAD load can only run on UAV/SRV resource. +INSTR.RESOURCECLASSFORSAMPLERGATHER sample, lod and gather should be on srv resource. +INSTR.RESOURCECLASSFORUAVSTORE store should be on uav resource. +INSTR.RESOURCECOORDINATEMISS coord uninitialized. +INSTR.RESOURCECOORDINATETOOMANY out of bound coord must be undef. +INSTR.RESOURCEKINDFORBUFFERLOADSTORE buffer load/store only works on Raw/Typed/StructuredBuffer. +INSTR.RESOURCEKINDFORCALCLOD lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray. +INSTR.RESOURCEKINDFORGATHER gather requires resource declared as texture/2D/Cube/2DArray/CubeArray. +INSTR.RESOURCEKINDFORGETDIM Invalid resource kind on GetDimensions. 
+INSTR.RESOURCEKINDFORSAMPLE sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray. +INSTR.RESOURCEKINDFORSAMPLEC samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray. +INSTR.RESOURCEKINDFORTEXTURELOAD texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray. +INSTR.RESOURCEKINDFORTEXTURESTORE texture store only works on Texture1D/1DArray/2D/2DArray/3D. +INSTR.RESOURCEKINDFORTRACERAY TraceRay should only use RTAccelerationStructure. +INSTR.RESOURCEMAPTOSINGLEENTRY Fail to map resource to resource table. +INSTR.RESOURCEOFFSETMISS offset uninitialized. +INSTR.RESOURCEOFFSETTOOMANY out of bound offset must be undef. +INSTR.RESOURCEUSER Resource should only be used by Load/GEP/Call. +INSTR.SAMPLECOMPTYPE sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT. +INSTR.SAMPLEINDEXFORLOAD2DMS load on Texture2DMS/2DMSArray require sampleIndex. +INSTR.SAMPLERMODEFORLOD lod instruction requires sampler declared in default mode. +INSTR.SAMPLERMODEFORSAMPLE sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode. +INSTR.SAMPLERMODEFORSAMPLEC sample_c_*/gather_c instructions require sampler declared in comparison mode. +INSTR.SIGNATUREOPERATIONNOTINENTRY Dxil operation for input output signature must be in entryPoints. +INSTR.STATUS Resource status should only be used by CheckAccessFullyMapped. +INSTR.STRUCTBITCAST Bitcast on struct types is not allowed. +INSTR.SVCONFLICTINGLAUNCHMODE Input system values are compatible with node shader launch mode. +INSTR.TEXTUREOFFSET offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7. +INSTR.TGSMRACECOND Race condition writing to shared memory detected, consider making this write conditional. +INSTR.UNDEFHITOBJECT HitObject is undef. +INSTR.UNDEFINEDVALUEFORUAVSTORE Assignment of undefined values to UAV. +INSTR.UNDEFRESULTFORGETDIMENSION GetDimensions used undef dimension %0 on %1. +INSTR.WRITEMASKFORTYPEDUAVSTORE store on typed uav must write to all four components of the UAV. +INSTR.WRITEMASKGAPFORUAV UAV write mask must be contiguous, starting at x: .x, .xy, .xyz, or .xyzw. +INSTR.WRITEMASKMATCHVALUEFORUAVSTORE uav store write mask must match store value mask, write mask is %0 and store value mask is %1. +META.BARYCENTRICSFLOAT3 only 'float3' type is allowed for SV_Barycentrics. +META.BARYCENTRICSINTERPOLATION SV_Barycentrics cannot be used with 'nointerpolation' type. +META.BARYCENTRICSTWOPERSPECTIVES There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode. +META.BRANCHFLATTEN Can't use branch and flatten attributes together. +META.CLIPCULLMAXCOMPONENTS Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components +META.CLIPCULLMAXROWS Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows. +META.COHERENCENOTONAPPENDCONSUME globally/reorder coherent incompatible with append/consume/counter buffers +META.COMPUTEWITHNODE Compute entry must not have node metadata +META.CONTROLFLOWHINTNOTONCONTROLFLOW Control flow hint only works on control flow inst. +META.DENSERESIDS Resource identifiers must be zero-based and dense. +META.DUPLICATESYSVALUE System value may only appear once in signature +META.ENTRYFUNCTION entrypoint not found. +META.FLAGSUSAGE Flags must match usage. +META.FORCECASEONSWITCH Attribute forcecase only works for switch. 
+META.INTEGERINTERPMODE Interpolation mode on integer must be Constant +META.INTERPMODEINONEROW Interpolation mode must be identical for all elements packed into the same row. +META.INTERPMODEVALID Interpolation mode must be valid +META.INVALIDCONTROLFLOWHINT Invalid control flow hint. +META.KNOWN Named metadata should be known +META.MAXTESSFACTOR Hull Shader MaxTessFactor must be [%0..%1]. %2 specified. +META.NOENTRYPROPSFORENTRY Entry point %0 must have entry properties. +META.NOSEMANTICOVERLAP Semantics must not overlap +META.REQUIRED Required metadata missing. +META.SEMAKINDMATCHESNAME Semantic name must match system value, when defined. +META.SEMAKINDVALID Semantic kind must be valid +META.SEMANTICCOMPTYPE %0 must be %1. +META.SEMANTICINDEXMAX System value semantics have a maximum valid semantic index +META.SEMANTICLEN Semantic length must be at least 1 and at most 64. +META.SEMANTICSHOULDBEALLOCATED Semantic should have a valid packing location +META.SEMANTICSHOULDNOTBEALLOCATED Semantic should have a packing location of -1 +META.SIGNATURECOMPTYPE signature %0 specifies unrecognized or invalid component type. +META.SIGNATUREDATAWIDTH Data width must be identical for all elements packed into the same row. +META.SIGNATUREILLEGALCOMPONENTORDER Component ordering for packed elements must be: arbitrary < system value < system generated value +META.SIGNATUREINDEXCONFLICT Only elements with compatible indexing rules may be packed together +META.SIGNATUREOUTOFRANGE Signature elements must fit within maximum signature size +META.SIGNATUREOVERLAP Signature elements may not overlap in packing location. +META.STRUCTBUFALIGNMENT StructuredBuffer stride not aligned +META.STRUCTBUFALIGNMENTOUTOFBOUND StructuredBuffer stride out of bounds +META.SYSTEMVALUEROWS System value may only have 1 row +META.TARGET Target triple must be 'dxil-ms-dx' +META.TESSELLATOROUTPUTPRIMITIVE Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW. +META.TESSELLATORPARTITION Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even. +META.TEXTURETYPE elements of typed buffers and textures must fit in four 32-bit quantities. +META.USED All metadata must be used by dxil. +META.VALIDSAMPLERMODE Invalid sampler mode on sampler . +META.VALUERANGE Metadata value must be within range. +META.VERSIONSUPPORTED Version in metadata must be supported. +META.WELLFORMED Metadata must be well-formed in operand count and types. +SM.64BITRAWBUFFERLOADSTORE i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3. +SM.AMPLIFICATIONSHADERPAYLOADSIZE For amplification shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes. +SM.AMPLIFICATIONSHADERPAYLOADSIZEDECLARED For amplification shader with entry '%0', payload size %1 is greater than declared size of %2 bytes. +SM.APPENDANDCONSUMEONSAMEUAV BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1. +SM.CBUFFERARRAYOFFSETALIGNMENT CBuffer array offset must be aligned to 16-bytes +SM.CBUFFERELEMENTOVERFLOW CBuffer elements must not overflow +SM.CBUFFEROFFSETOVERLAP CBuffer offsets must not overlap +SM.CBUFFERSIZE CBuffer size must not exceed 65536 bytes +SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT D3D12 constant/texture buffer template element can only be a struct. +SM.COMPLETEPOSITION Not all elements of SV_Position were written. +SM.CONSTANTINTERPMODE Interpolation mode must be constant for MS primitive output. 
+SM.COUNTERONLYONSTRUCTBUF BufferUpdateCounter valid only on structured buffers. +SM.CSNOSIGNATURES Compute shaders must not have shader signatures. +SM.DOMAINLOCATIONIDXOOB DomainLocation component index out of bounds for the domain. +SM.DSINPUTCONTROLPOINTCOUNTRANGE DS input control point count must be [0..%0]. %1 specified. +SM.DXILVERSION Target shader model requires specific Dxil Version +SM.GSINSTANCECOUNTRANGE GS instance count must be [1..%0]. %1 specified. +SM.GSOUTPUTVERTEXCOUNTRANGE GS output vertex count must be [0..%0]. %1 specified. +SM.GSTOTALOUTPUTVERTEXDATARANGE Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2. This value cannot be greater than %3. +SM.GSVALIDINPUTPRIMITIVE GS input primitive unrecognized. +SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY GS output primitive topology unrecognized. +SM.HSINPUTCONTROLPOINTCOUNTRANGE HS input control point count must be [0..%0]. %1 specified. +SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH For pass thru hull shader, input control point count must match output control point count +SM.INCOMPATIBLECALLINENTRY Features used in internal function calls must be compatible with entry +SM.INCOMPATIBLEDERIVINCOMPUTESHADERMODEL Derivatives in compute-model shaders require shader model 6.6 and above +SM.INCOMPATIBLEDERIVLAUNCH Node shaders only support derivatives in broadcasting launch mode +SM.INCOMPATIBLEOPERATION Operations used in entry function must be compatible with shader stage and other properties +SM.INCOMPATIBLEREQUIRESGROUP Functions requiring groupshared memory must be called from shaders with a visible group +SM.INCOMPATIBLESHADERMODEL Functions may only use features available in the current shader model +SM.INCOMPATIBLESTAGE Functions may only use features available in the entry function's stage +SM.INCOMPATIBLETHREADGROUPDIM When derivatives are used in compute-model shaders, the thread group dimensions must be compatible +SM.INSIDETESSFACTORSIZEMATCHDOMAIN InsideTessFactor rows, columns (%0, %1) invalid for domain %2. Expected %3 rows and 1 column. +SM.INVALIDRESOURCECOMPTYPE Invalid resource return type. +SM.INVALIDRESOURCEKIND Invalid resources kind. +SM.INVALIDSAMPLERFEEDBACKTYPE Invalid sampler feedback type. +SM.INVALIDTEXTUREKINDONUAV TextureCube[Array] resources are not supported with UAVs. +SM.ISOLINEOUTPUTPRIMITIVEMISMATCH Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain. +SM.MAXMSSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1. +SM.MAXTGSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1. +SM.MAXTHEADGROUP Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1. +SM.MESHPSIGROWCOUNT For shader '%0', primitive output signatures are taking up more than %1 rows. +SM.MESHSHADERINOUTSIZE For shader '%0', payload plus output size is greater than %1. +SM.MESHSHADERMAXPRIMITIVECOUNT MS max primitive output count must be [0..%0]. %1 specified. +SM.MESHSHADERMAXVERTEXCOUNT MS max vertex output count must be [0..%0]. %1 specified. +SM.MESHSHADEROUTPUTSIZE For shader '%0', vertex plus primitive output size is greater than %1. +SM.MESHSHADERPAYLOADSIZE For mesh shader with entry '%0', payload size %1 is greater than maximum size of %2 bytes. +SM.MESHSHADERPAYLOADSIZEDECLARED For mesh shader with entry '%0', payload size %1 is greater than declared size of %2 bytes. 
+SM.MESHTOTALSIGROWCOUNT For shader '%0', vertex and primitive output signatures are taking up more than %1 rows. +SM.MESHVSIGROWCOUNT For shader '%0', vertex output signatures are taking up more than %1 rows. +SM.MULTISTREAMMUSTBEPOINT When multiple GS output streams are used they must be pointlists +SM.NAME Target shader model name must be known +SM.NOINTERPMODE Interpolation mode must be undefined for VS input/PS output/patch constant. +SM.NOPSOUTPUTIDX Pixel shader output registers are not indexable. +SM.OPCODE Opcode must be defined in target shader model +SM.OPCODEININVALIDFUNCTION Invalid DXIL opcode usage like StorePatchConstant in patch constant function +SM.OPERAND Operand must be defined in target shader model. +SM.OUTPUTCONTROLPOINTCOUNTRANGE output control point count must be [%0..%1]. %2 specified. +SM.OUTPUTCONTROLPOINTSTOTALSCALARS Total number of scalars across all HS output control points must not exceed . +SM.PATCHCONSTANTONLYFORHSDS patch constant signature only valid in HS and DS. +SM.PROGRAMVERSION Program Version in Dxil Container does not match Dxil Module shader model version +SM.PSCONSISTENTINTERP Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample). +SM.PSCOVERAGEANDINNERCOVERAGE InnerCoverage and Coverage are mutually exclusive. +SM.PSMULTIPLEDEPTHSEMANTIC Pixel Shader only allows one type of depth semantic to be declared. +SM.PSOUTPUTSEMANTIC Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found. +SM.PSTARGETCOL0 SV_Target packed location must start at column 0. +SM.PSTARGETINDEXMATCHESROW SV_Target semantic index must match packed row location. +SM.RAYSHADERPAYLOADSIZE For shader '%0', %1 size is smaller than argument's allocation size. +SM.RAYSHADERSIGNATURES Ray tracing shader '%0' should not have any shader signatures. +SM.RESOURCERANGEOVERLAP Resource ranges must not overlap +SM.ROVONLYINPS RasterizerOrdered objects are only allowed in 5.0+ pixel shaders. +SM.SAMPLECOUNTONLYON2DMS Only Texture2DMS/2DMSArray could has sample count. +SM.SEMANTIC Semantic must be defined in target shader model +SM.STREAMINDEXRANGE Stream index (%0) must between 0 and %1. +SM.TESSFACTORFORDOMAIN Required TessFactor for domain not found declared anywhere in Patch Constant data. +SM.TESSFACTORSIZEMATCHDOMAIN TessFactor rows, columns (%0, %1) invalid for domain %2. Expected %3 rows and 1 column. +SM.TGSMUNSUPPORTED Thread Group Shared Memory not supported %0. +SM.THREADGROUPCHANNELRANGE Declared Thread Group %0 size %1 outside valid range [%2..%3]. +SM.TRIOUTPUTPRIMITIVEMISMATCH Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain. +SM.UNDEFINEDOUTPUT Not all elements of output %0 were written. +SM.VALIDDOMAIN Invalid Tessellator Domain specified. Must be isoline, tri or quad. +SM.VIEWIDNEEDSSLOT ViewID requires compatible space in pixel shader input signature +SM.WAVESIZEALLZEROWHENUNDEFINED WaveSize Max and Preferred must be 0 when Min is 0 +SM.WAVESIZEEXPECTSONEPARAM WaveSize tag expects exactly 1 parameter. 
+SM.WAVESIZEMAXANDPREFERREDZEROWHENNORANGE WaveSize Max and Preferred must be 0 to encode min==max +SM.WAVESIZEMAXGREATERTHANMIN WaveSize Max must greater than Min +SM.WAVESIZENEEDSCONSTANTOPERANDS WaveSize metadata operands must be constant values. +SM.WAVESIZENEEDSSM66OR67 WaveSize is valid only for Shader Model 6.6 and 6.7. +SM.WAVESIZEONCOMPUTEORNODE WaveSize only allowed on compute or node shaders +SM.WAVESIZEPREFERREDINRANGE WaveSize Preferred must be within Min..Max range +SM.WAVESIZERANGEEXPECTSTHREEPARAMS WaveSize Range tag expects exactly 3 parameters. +SM.WAVESIZERANGENEEDSSM68PLUS WaveSize Range is valid only for Shader Model 6.8 and higher. +SM.WAVESIZETAGDUPLICATE WaveSize or WaveSizeRange tag may only appear once per entry point. +SM.WAVESIZEVALUE WaveSize value must be a power of 2 in range [4..128] +SM.ZEROHSINPUTCONTROLPOINTWITHINPUT When HS input control point count is 0, no input signature should exist. +TYPES.DEFINED Type must be defined based on DXIL primitives +TYPES.I8 I8 can only be used as immediate value for intrinsic or as i8* via bitcast by lifetime intrinsics. +TYPES.INTWIDTH Int type must be of valid width +TYPES.NOMULTIDIM Only one dimension allowed for array type. +TYPES.NOPTRTOPTR Pointers to pointers, or pointers in structures are not allowed. +TYPES.NOVECTOR Vector types must not be present +============================================================= ======================================================================================================================================================================================================================================================================================================== .. VALRULES-RST:END diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index 637bd8dae8..6850902a81 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -19,9 +19,48 @@ The included licenses apply to the following files: ### Upcoming Release -Place release notes for the upcoming release below this line and remove this line upon naming this release. +- Fix regression: [#7510](https://github.com/microsoft/DirectXShaderCompiler/issues/7510) crash when calling `sizeof` on templated type. +- Fix regression: [#7508](https://github.com/microsoft/DirectXShaderCompiler/issues/7508) crash when calling `Load` with `status`. +- Header file `dxcpix.h` was added to the release package. + +### Version 1.8.2505 + +#### Potentially breaking changes - Typed buffers (including ROV buffers) no longer accept types other than vectors and scalars. Any other types will produce descriptive errors. This removes support for appropriately sized matrices and structs. Though it worked in some contexts, code generated from such types was unreliable. + - Load and Store operations have been refactored as a consequence. Behavior should be identical, please file issues if discrepancies are observed. +- The compiler will now always use the internal validator instead of searching for an external DXIL.dll. The (hidden) `-select-validator` option has been removed. + +#### Notable SPIR-V updates + +- Fix unnecessary Int64 requirement when loading Float64 +- Added vk::BufferPointer, see [proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0010-vk-buffer-ref.md) for more details. +- Implement QuadAny and QuadAll (#7266) +- Fix -fvk-invert-y (#7447) + +#### Shader Model 6.9 Preview + +You can now compile shaders to SM 6.9, but this is a preview, so shader hashes will be set to the PREVIEW_BYPASS pattern. 
+SM 6.9 shaders will only work with AgilitySDK 1.717.0-preview, a supported preview driver, and use of experimental shader models in developer mode. +Preview shaders will not be compatible with the SM 6.9 release, or likely even later versions of the SM 6.9 preview. + +SM 6.9 Preview Additions: + +- Long vectors are allowed in HLSL when targeting shader model 6.9. Vectors up to 1024 elements in length can be loaded from/stored to raw buffers and used in elementwise operations. See the [long vector proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0026-hlsl-long-vector-type.md) for more details. +- HLSL Vectors are still limited to a maximum of 4 elements when used in certain contexts: + - entry function inputs/outputs + - parameter, payload, attribute, and node record types for mesh, raytracing, and node shaders + - constant buffers (cbuffer), texture buffers (tbuffer), textures and typed buffers + - Note: some HLSL elementwise intrinsics do not yet support long vectors in this preview +- Native vectors of up to 1024 elements are now present in DXIL. This includes vector llvm instructions, load/store, and various elementwise DXIL operations. This may result in smaller DXIL and potentially other performance improvements. See the [dxil vectors proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0030-dxil-vectors.md) for more details. +- Cooperative Vector operations, a subset of Linear Algebra (LinAlg). See the [cooperative vectors proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0029-cooperative-vector.md) and the [HLSL header based API proposal](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0031-hlsl-vector-matrix-operations.md) for more details. + - New built-in operations are added for multiplying long vectors with a matrix in a ByteAddressBuffer, optionally with accumulation and bias data, as well as outer product and vector accumulate operations. + - An HLSL header shipped with this release provides a more convenient API for using these built-in operations. +- Support for [Opacity Micromaps](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0024-opacity-micromaps.md) in DXR shaders as well as for RayQuery. + - Unlocks DXR performance improvements using triangle sub-divisions for fast hit/miss detection to reduce the need for anyhit invocations. +- Support for [Shader Execution Reordering](https://github.com/microsoft/hlsl-specs/blob/main/proposals/0027-shader-execution-reordering.md) in DXR. + - Introduces `MaybeReorderThread()` to explicitly specify where and how shader execution coherence can be improved. `MaybeReorderThread()` can be used in raygeneration shaders. + - `HitObject` decouples traversal, intersection testing and anyhit shading from closesthit and miss shading for more control and better reordering opportunities. `HitObject` can be used in raygeneration, closesthit and miss shaders. ### Version 1.8.2502 diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst index b5e9c05079..a695e5854d 100644 --- a/docs/SPIR-V.rst +++ b/docs/SPIR-V.rst @@ -896,6 +896,13 @@ are translated into SPIR-V ``OpTypeImage``, with parameters: The meanings of the headers in the above table is explained in ``OpTypeImage`` of the SPIR-V spec. +For storage images (e.g. ``RWTexture2D``) and texel buffers (e.g. ``RWBuffer``), +the image format is typically inferred from the data type ``T``. However, the +``-fspv-use-unknown-image-format`` command-line option can be used to change +this behavior. 
When this option is active, the default format for these +resources becomes ``Unknown`` if not otherwise specified by the +``[[vk::image_format]]`` attribute. + Vulkan specific Image Formats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1012,17 +1019,18 @@ right now: 2. DirectX memory layout rules for uniform buffers and storage buffers: they allow packing data on the application side that can be shared with DirectX. They can be enabled by ``-fvk-use-dx-layout``. + + NOTE: This requires ``VK_EXT_scalar_block_layout`` to be enabled on the + application side. 3. Strict OpenGL ``std140`` for uniform buffers and strict OpenGL ``std430`` for storage buffers: they allow packing data on the application side that can be shared with OpenGL. They can be enabled by ``-fvk-use-gl-layout``. 4. Scalar layout rules introduced via `VK_EXT_scalar_block_layout`, which basically aligns all aggregrate types according to their elements' natural alignment. They can be enabled by ``-fvk-use-scalar-layout``. - -To use scalar layout, the application side need to request -``VK_EXT_scalar_block_layout``. This is also true for using DirectX memory -layout since there is no dedicated DirectX layout extension for Vulkan -(at least for now). So we must request something more permissive. + + NOTE: This requires ``VK_EXT_scalar_block_layout`` to be enabled on the + application side. In the above, "vector-relaxed OpenGL ``std140``/``std430``" rules mean OpenGL ``std140``/``std430`` rules with the following modification for vector type @@ -1032,7 +1040,7 @@ alignment: 2. If the above causes an `improper straddle `_, the alignment will be set to 16 bytes. -As an exmaple, for the following HLSL definition: +As an example, for the following HLSL definition: .. code:: hlsl @@ -3967,7 +3975,7 @@ RayQuery Mapping to SPIR-V +---------------------------------------------------+-------------------------------------------------------------------------+ |``.WorldRayDirection`` | ``OpRayQueryGetWorldRayDirectionKHR`` | +---------------------------------------------------+-------------------------------------------------------------------------+ -|``.WorldRayOrigin` | ``OpRayQueryGetWorldRayOriginKHR`` | +|``.WorldRayOrigin`` | ``OpRayQueryGetWorldRayOriginKHR`` | +---------------------------------------------------+-------------------------------------------------------------------------+ Shader Model 6.0+ Wave Intrinsics @@ -4227,7 +4235,7 @@ codegen for Vulkan: - ``-fvk-use-dx-layout``: Uses DirectX layout rules for resources. - ``-fvk-invert-y``: Negates (additively inverts) SV_Position.y before writing to stage output. Used to accommodate the difference between Vulkan's - coordinate system and DirectX's. Only allowed in VS/DS/GS. + coordinate system and DirectX's. Only allowed in VS/DS/GS/MS/Lib. - ``-fvk-use-dx-position-w``: Reciprocates (multiplicatively inverts) SV_Position.w after reading from stage input. 
Used to accommodate the difference between Vulkan DirectX: the w component of SV_Position in PS is diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers index 0e71067798..2a611a970f 160000 --- a/external/SPIRV-Headers +++ b/external/SPIRV-Headers @@ -1 +1 @@ -Subproject commit 0e710677989b4326ac974fd80c5308191ed80965 +Subproject commit 2a611a970fdbc41ac2e3e328802aed9985352dca diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools index 4bd1536ed7..33e0256818 160000 --- a/external/SPIRV-Tools +++ b/external/SPIRV-Tools @@ -1 +1 @@ -Subproject commit 4bd1536ed79003a5194a4bd8c9aa2fa17a84c15b +Subproject commit 33e02568181e3312f49a3cf33df470bf96ef293a diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h index 8c73328fbd..84588a2ff7 100644 --- a/include/dxc/DXIL/DxilConstants.h +++ b/include/dxc/DXIL/DxilConstants.h @@ -154,6 +154,7 @@ const float kMaxMipLodBias = 15.99f; const float kMinMipLodBias = -16.0f; const unsigned kResRetStatusIndex = 4; +const unsigned kVecResRetStatusIndex = 1; /* hctdb_instrhelp.get_max_oload_dims()*/ // OLOAD_DIMS-TEXT:BEGIN @@ -162,24 +163,32 @@ const unsigned kDxilMaxOloadDims = 2; enum class ComponentType : uint32_t { Invalid = 0, - I1, - I16, - U16, - I32, - U32, - I64, - U64, - F16, - F32, - F64, - SNormF16, - UNormF16, - SNormF32, - UNormF32, - SNormF64, - UNormF64, - PackedS8x32, - PackedU8x32, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, + // END + LastEntry }; @@ -743,6 +752,19 @@ enum class OpCode : unsigned { CreateHandleForLib = 160, // create resource handle from resource struct for library + // Linear Algebra Operations + MatVecMul = + 305, // Multiplies a MxK dimension matrix and a K sized input vector + MatVecMulAdd = 306, // multiplies a MxK dimension matrix and a K sized input + // vector and adds an M-sized bias vector + OuterProductAccumulate = + 307, // Computes the outer product between column vectors and an MxN + // matrix is accumulated component-wise atomically (with device + // scope) in memory + VectorAccumulate = 308, // Accumulates the components of a vector + // component-wise atomically (with device scope) to + // the corresponding elements of an array in memory + // Mesh shader instructions EmitIndices = 169, // emit a primitive's vertex indices in a mesh shader GetMeshPayload = @@ -1060,7 +1082,7 @@ enum class OpCode : unsigned { NumOpCodes_Dxil_1_7 = 226, NumOpCodes_Dxil_1_8 = 258, - NumOpCodes = 305 // exclusive last value of enumeration + NumOpCodes = 309 // exclusive last value of enumeration }; // OPCODE-ENUM:END @@ -1201,6 +1223,12 @@ enum class OpCodeClass : unsigned { // Library create handle from resource struct (like HL intrinsic) CreateHandleForLib, + // Linear Algebra Operations + MatVecMul, + MatVecMulAdd, + OuterProductAccumulate, + VectorAccumulate, + // Mesh shader instructions EmitIndices, GetMeshPayload, @@ -1385,7 +1413,7 @@ enum class OpCodeClass : unsigned { NumOpClasses_Dxil_1_7 = 153, NumOpClasses_Dxil_1_8 = 174, - NumOpClasses = 190 // exclusive last value of enumeration + NumOpClasses = 194 // exclusive last value of enumeration }; // OPCODECLASS-ENUM:END @@ -1556,6 +1584,38 @@ const unsigned kMSStoreOutputColOpIdx = 3; const unsigned kMSStoreOutputVIdxOpIdx = 4; const unsigned 
kMSStoreOutputValOpIdx = 5; +// HitObject::MakeMiss +const unsigned kHitObjectMakeMiss_RayDescOpIdx = 3; +const unsigned kHitObjectMakeMiss_NumOp = 11; + +// HitObject::TraceRay +const unsigned kHitObjectTraceRay_RayDescOpIdx = 7; +const unsigned kHitObjectTraceRay_PayloadOpIdx = 15; +const unsigned kHitObjectTraceRay_NumOp = 16; + +// MatVec Ops +const unsigned kMatVecMulInputVectorIdx = 1; +const unsigned kMatVecMulIsInputUnsignedIdx = 2; +const unsigned kMatVecMulInputInterpretationIdx = 3; +const unsigned kMatVecMulMatrixBufferIdx = 4; +const unsigned kMatVecMulMatrixOffsetIdx = 5; +const unsigned kMatVecMulMatrixInterpretationIdx = 6; +const unsigned kMatVecMulMatrixMIdx = 7; +const unsigned kMatVecMulMatrixKIdx = 8; +const unsigned kMatVecMulMatrixLayoutIdx = 9; +const unsigned kMatVecMulMatrixTransposeIdx = 10; +const unsigned kMatVecMulMatrixStrideIdx = 11; +const unsigned kMatVecMulIsOutputUnsignedIdx = 12; + +// MatVecAdd +const unsigned kMatVecMulAddBiasInterpretation = 14; +const unsigned kMatVecMulAddIsOutputUnsignedIdx = 15; + +// Outer Product Accumulate +const unsigned kOuterProdAccMatrixInterpretation = 5; +const unsigned kOuterProdAccMatrixLayout = 6; +const unsigned kOuterProdAccMatrixStride = 7; + // TODO: add operand index for all the OpCodeClass. } // namespace OperandIndex @@ -2127,6 +2187,13 @@ extern const char *kHostLayoutTypePrefix; extern const char *kWaveOpsIncludeHelperLanesString; +enum class LinalgMatrixLayout : uint32_t { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + } // namespace DXIL } // namespace hlsl diff --git a/include/dxc/DXIL/DxilInstructions.h b/include/dxc/DXIL/DxilInstructions.h index a99c5360d4..9a4030fd8e 100644 --- a/include/dxc/DXIL/DxilInstructions.h +++ b/include/dxc/DXIL/DxilInstructions.h @@ -9918,5 +9918,235 @@ struct DxilInst_RawBufferVectorStore { llvm::APInt(32, (uint64_t)val))); } }; + +/// This instruction Multiplies a MxK dimension matrix and a K sized input +/// vector +struct DxilInst_MatVecMul { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_MatVecMul(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::MatVecMul); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (13 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_isInputUnsigned = 2, + arg_inputInterpretation = 3, + arg_matrixBuffer = 4, + arg_matrixOffset = 5, + arg_matrixIntepretation = 6, + arg_matrixM = 7, + arg_matrixK = 8, + arg_matrixLayout = 9, + arg_matrixTranspose = 10, + arg_matrixStride = 11, + arg_isOutputUnsigned = 12, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); } + void set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); } + void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixOffset() const { return 
Instr->getOperand(5); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); } + llvm::Value *get_matrixM() const { return Instr->getOperand(7); } + void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); } + llvm::Value *get_matrixK() const { return Instr->getOperand(8); } + void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); } + void set_matrixLayout(llvm::Value *val) { Instr->setOperand(9, val); } + llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); } + void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); } + llvm::Value *get_matrixStride() const { return Instr->getOperand(11); } + void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); } + llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(12); } + void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(12, val); } +}; + +/// This instruction multiplies a MxK dimension matrix and a K sized input +/// vector and adds an M-sized bias vector +struct DxilInst_MatVecMulAdd { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_MatVecMulAdd(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, + hlsl::OP::OpCode::MatVecMulAdd); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (16 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_isInputUnsigned = 2, + arg_inputInterpretation = 3, + arg_matrixBuffer = 4, + arg_matrixOffset = 5, + arg_matrixIntepretation = 6, + arg_matrixM = 7, + arg_matrixK = 8, + arg_matrixLayout = 9, + arg_matrixTranspose = 10, + arg_matrixStride = 11, + arg_biasBuffer = 12, + arg_biasOffset = 13, + arg_biasIntepretation = 14, + arg_isOutputUnsigned = 15, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); } + void set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); } + void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixOffset() const { return Instr->getOperand(5); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); } + llvm::Value *get_matrixM() const { return Instr->getOperand(7); } + void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); } + llvm::Value *get_matrixK() const { return Instr->getOperand(8); } + void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); } + void set_matrixLayout(llvm::Value *val) { 
Instr->setOperand(9, val); } + llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); } + void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); } + llvm::Value *get_matrixStride() const { return Instr->getOperand(11); } + void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); } + llvm::Value *get_biasBuffer() const { return Instr->getOperand(12); } + void set_biasBuffer(llvm::Value *val) { Instr->setOperand(12, val); } + llvm::Value *get_biasOffset() const { return Instr->getOperand(13); } + void set_biasOffset(llvm::Value *val) { Instr->setOperand(13, val); } + llvm::Value *get_biasIntepretation() const { return Instr->getOperand(14); } + void set_biasIntepretation(llvm::Value *val) { Instr->setOperand(14, val); } + llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(15); } + void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(15, val); } +}; + +/// This instruction Computes the outer product between column vectors and an +/// MxN matrix is accumulated component-wise atomically (with device scope) in +/// memory +struct DxilInst_OuterProductAccumulate { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_OuterProductAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst( + Instr, hlsl::OP::OpCode::OuterProductAccumulate); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (8 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector1 = 1, + arg_inputVector2 = 2, + arg_matrixBuffer = 3, + arg_matrixOffset = 4, + arg_matrixIntepretation = 5, + arg_matrixLayout = 6, + arg_matrixStride = 7, + }; + // Accessors + llvm::Value *get_inputVector1() const { return Instr->getOperand(1); } + void set_inputVector1(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_inputVector2() const { return Instr->getOperand(2); } + void set_inputVector2(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(3); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixOffset() const { return Instr->getOperand(4); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(5); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(5, val); } + int32_t get_matrixIntepretation_val() const { + return (int32_t)(llvm::dyn_cast(Instr->getOperand(5)) + ->getZExtValue()); + } + void set_matrixIntepretation_val(int32_t val) { + Instr->setOperand(5, llvm::Constant::getIntegerValue( + llvm::IntegerType::get(Instr->getContext(), 32), + llvm::APInt(32, (uint64_t)val))); + } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(6); } + void set_matrixLayout(llvm::Value *val) { Instr->setOperand(6, val); } + int32_t get_matrixLayout_val() const { + return (int32_t)(llvm::dyn_cast(Instr->getOperand(6)) + ->getZExtValue()); + } + void set_matrixLayout_val(int32_t val) { + Instr->setOperand(6, llvm::Constant::getIntegerValue( + llvm::IntegerType::get(Instr->getContext(), 32), + llvm::APInt(32, (uint64_t)val))); + } + llvm::Value *get_matrixStride() const { return Instr->getOperand(7); } + void set_matrixStride(llvm::Value 
*val) { Instr->setOperand(7, val); } +}; + +/// This instruction Accumulates the components of a vector component-wise +/// atomically (with device scope) to the corresponding elements of an array in +/// memory +struct DxilInst_VectorAccumulate { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_VectorAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, + hlsl::OP::OpCode::VectorAccumulate); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (4 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_arrayBuffer = 2, + arg_arrayOffset = 3, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_arrayBuffer() const { return Instr->getOperand(2); } + void set_arrayBuffer(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_arrayOffset() const { return Instr->getOperand(3); } + void set_arrayOffset(llvm::Value *val) { Instr->setOperand(3, val); } +}; // INSTR-HELPER:END } // namespace hlsl diff --git a/include/dxc/DxilContainer/DxcContainerBuilder.h b/include/dxc/DxilContainer/DxcContainerBuilder.h index 9a3241525c..e79fec18c8 100644 --- a/include/dxc/DxilContainer/DxcContainerBuilder.h +++ b/include/dxc/DxilContainer/DxcContainerBuilder.h @@ -45,8 +45,7 @@ class DxcContainerBuilder : public IDxcContainerBuilder { return DoBasicQueryInterface(this, riid, ppvObject); } - void Init(const char *warning = nullptr) { - m_warning = warning; + void Init() { m_RequireValidation = false; m_HasPrivateData = false; m_HashFunction = nullptr; @@ -67,7 +66,6 @@ class DxcContainerBuilder : public IDxcContainerBuilder { PartList m_parts; CComPtr m_pContainer; - const char *m_warning; bool m_RequireValidation; bool m_HasPrivateData; // Function to compute hash when valid dxil container is built diff --git a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl index 4b58b406c2..902f2e9652 100644 --- a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl +++ b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl @@ -565,9 +565,13 @@ RDAT_DXIL_ENUM_START(hlsl::DXIL::ComponentType, uint32_t) RDAT_ENUM_VALUE_NODEF(UNormF64) RDAT_ENUM_VALUE_NODEF(PackedS8x32) RDAT_ENUM_VALUE_NODEF(PackedU8x32) + RDAT_ENUM_VALUE_NODEF(U8) + RDAT_ENUM_VALUE_NODEF(I8) + RDAT_ENUM_VALUE_NODEF(F8_E4M3) + RDAT_ENUM_VALUE_NODEF(F8_E5M2) RDAT_ENUM_VALUE_NODEF(LastEntry) #if DEF_RDAT_ENUMS == DEF_RDAT_DUMP_IMPL - static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 19, + static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 23, "otherwise, RDAT_DXIL_ENUM definition needs updating"); #endif RDAT_ENUM_END() diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h index a7db8612a6..79cbadc42c 100644 --- a/include/dxc/HLSL/HLOperations.h +++ b/include/dxc/HLSL/HLOperations.h @@ -396,7 +396,12 @@ const unsigned kAnnotateHandleResourceTypeOpIdx = 3; // TraceRay. 
const unsigned kTraceRayRayDescOpIdx = 7; -const unsigned kTraceRayPayLoadOpIdx = 8; +// kTraceRayPayloadPreOpIdx is before flattening the RayDesc +const unsigned kTraceRayPayloadPreOpIdx = 8; +// kTraceRayPayloadOpIdx is after flattening the RayDesc +const unsigned kTraceRayPayloadOpIdx = 11; +const unsigned kTraceRay_PreNumOp = 9; +const unsigned kTraceRay_NumOp = 12; // AllocateRayQuery const unsigned kAllocateRayQueryRayFlagsIdx = 1; @@ -407,6 +412,10 @@ const unsigned kCallShaderPayloadOpIdx = 2; // TraceRayInline. const unsigned kTraceRayInlineRayDescOpIdx = 5; +// kTraceRayInlinePayloadPreOpIdx is before flattening the RayDesc +const unsigned kTraceRayInlinePayloadPreOpIdx = 6; +// kTraceRayInlinePayloadOpIdx is after flattening the RayDesc +const unsigned kTraceRayInlinePayloadOpIdx = 9; // ReportIntersection. const unsigned kReportIntersectionAttributeOpIdx = 3; @@ -435,8 +444,75 @@ const unsigned kAnnotateNodeRecordHandleNodeRecordPropIdx = 2; // HitObject::MakeMiss const unsigned kHitObjectMakeMiss_NumOp = 8; -const unsigned kHitObjectMakeMissRayDescOpIdx = 4; - +const unsigned kHitObjectMakeMiss_RayDescOpIdx = 4; + +// HitObject::TraceRay +const unsigned kHitObjectTraceRay_RayDescOpIdx = 8; +// kHitObjectTraceRay_PayloadPreOpIdx is before flattening the RayDesc +const unsigned kHitObjectTraceRay_PayloadPreOpIdx = 9; +// kHitObjectTraceRay_PayloadOpIdx is after flattening the RayDesc +const unsigned kHitObjectTraceRay_PayloadOpIdx = 12; +const unsigned kHitObjectTraceRay_PreNumOp = 10; +const unsigned kHitObjectTraceRay_NumOp = 13; + +// HitObject::Invoke +const unsigned kHitObjectInvoke_PayloadOpIdx = 2; + +// HitObject::FromRayQuery +const unsigned kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx = 4; +const unsigned kHitObjectFromRayQuery_WithAttrs_NumOp = 5; + +// HitObject::GetAttributes +const unsigned kHitObjectGetAttributes_AttributeOpIdx = 2; + +// Linear Algebra Operations + +// MatVecMul +const unsigned kMatVecMulOutputVectorIdx = 1; +const unsigned kMatVecMulIsOutputUnsignedIdx = 2; +const unsigned kMatVecMulInputVectorIdx = 3; +const unsigned kMatVecMulIsInputUnsignedIdx = 4; +const unsigned kMatVecMulInputInterpretationIdx = 5; +const unsigned kMatVecMulMatrixBufferIdx = 6; +const unsigned kMatVecMulMatrixOffsetIdx = 7; +const unsigned kMatVecMulMatrixInterpretationIdx = 8; +const unsigned kMatVecMulMatrixMIdx = 9; +const unsigned kMatVecMulMatrixKIdx = 10; +const unsigned kMatVecMulMatrixLayoutIdx = 11; +const unsigned kMatVecMulMatrixTransposeIdx = 12; +const unsigned kMatVecMulMatrixStrideIdx = 13; + +// MatVecMulAdd +const unsigned kMatVecMulAddOutputVectorIdx = 1; +const unsigned kMatVecMulAddIsOutputUnsignedIdx = 2; +const unsigned kMatVecMulAddInputVectorIdx = 3; +const unsigned kMatVecMulAddIsInputUnsignedIdx = 4; +const unsigned kMatVecMulAddInputInterpretationIdx = 5; +const unsigned kMatVecMulAddMatrixBufferIdx = 6; +const unsigned kMatVecMulAddMatrixOffsetIdx = 7; +const unsigned kMatVecMulAddMatrixInterpretationIdx = 8; +const unsigned kMatVecMulAddMatrixMIdx = 9; +const unsigned kMatVecMulAddMatrixKIdx = 10; +const unsigned kMatVecMulAddMatrixLayoutIdx = 11; +const unsigned kMatVecMulAddMatrixTransposeIdx = 12; +const unsigned kMatVecMulAddMatrixStrideIdx = 13; +const unsigned kMatVecMulAddBiasBufferIdx = 14; +const unsigned kMatVecMulAddBiasOffsetIdx = 15; +const unsigned kMatVecMulAddBiasInterpretationIdx = 16; + +// OuterProductAccumulate +const unsigned kOuterProdAccInputVec1Idx = 1; +const unsigned kOuterProdAccInputVec2Idx = 2; +const unsigned 
kOuterProdAccMatrixIdx = 3; +const unsigned kOuterProdAccMatrixOffsetIdx = 4; +const unsigned kOuterProdAccMatrixInterpretationIdx = 5; +const unsigned kOuterProdAccMatrixLayoutIdx = 6; +const unsigned kOuterProdAccMatrixStrideIdx = 7; + +// Vector Accumulate +const unsigned kVectorAccInputVecIdx = 1; +const unsigned kVectorAccMatrixIdx = 2; +const unsigned kVectorAccMatrixOffsetIdx = 3; } // namespace HLOperandIndex llvm::Function *GetOrCreateHLFunction(llvm::Module &M, diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h index d37c27a38e..197bd3e1f5 100644 --- a/include/dxc/HlslIntrinsicOp.h +++ b/include/dxc/HlslIntrinsicOp.h @@ -107,6 +107,10 @@ enum class IntrinsicOp { IOP_WorldToObject = 99, IOP_WorldToObject3x4 = 100, IOP_WorldToObject4x3 = 101, + IOP___builtin_MatVecMul = 390, + IOP___builtin_MatVecMulAdd = 391, + IOP___builtin_OuterProductAccumulate = 392, + IOP___builtin_VectorAccumulate = 393, IOP_abort = 102, IOP_abs = 103, IOP_acos = 104, @@ -396,7 +400,7 @@ enum class IntrinsicOp { IOP_usign = 355, MOP_InterlockedUMax = 356, MOP_InterlockedUMin = 357, - Num_Intrinsics = 390, + Num_Intrinsics = 394, }; inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) { switch (opcode) { diff --git a/include/dxc/Support/HLSLOptions.h b/include/dxc/Support/HLSLOptions.h index 56e95a1659..31ca3d1c14 100644 --- a/include/dxc/Support/HLSLOptions.h +++ b/include/dxc/Support/HLSLOptions.h @@ -114,13 +114,6 @@ struct RewriterOpts { bool DeclGlobalCB = false; // OPT_rw_decl_global_cb }; -enum class ValidatorSelection : int { - Auto, // Try DXIL.dll; fallback to internal validator - Internal, // Force internal validator (even if DXIL.dll is present) - External, // Use DXIL.dll, failing compilation if not available - Invalid = -1 // Invalid -}; - /// Use this class to capture all options. class DxcOpts { public: @@ -225,8 +218,6 @@ class DxcOpts { bool ResMayAlias = false; // OPT_res_may_alias unsigned long ValVerMajor = UINT_MAX, ValVerMinor = UINT_MAX; // OPT_validator_version - ValidatorSelection SelectValidator = - ValidatorSelection::Auto; // OPT_select_validator unsigned ScanLimit = 0; // OPT_memdep_block_scan_limit bool ForceZeroStoreLifetimes = false; // OPT_force_zero_store_lifetimes bool EnableLifetimeMarkers = false; // OPT_enable_lifetime_markers diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td index ea000f4877..4a38e275c3 100644 --- a/include/dxc/Support/HLSLOptions.td +++ b/include/dxc/Support/HLSLOptions.td @@ -317,8 +317,6 @@ def print_before_all : Flag<["-", "/"], "print-before-all">, Group; def print_before : Separate<["-", "/"], "print-before">, Group, Flags<[CoreOption, HelpHidden]>, HelpText<"Print LLVM IR before a specific pass. 
May be specificied multiple times.">; -def select_validator : Separate<["-", "/"], "select-validator">, Group, Flags<[CoreOption, HelpHidden]>, - HelpText<"Select validator: auto: (default) use DXIL.dll if found, otherwise use internal; internal: internal non-signing validator; external: use DXIL.dll if found, otherwise fail compilation.">; def print_after_all : Flag<["-", "/"], "print-after-all">, Group, Flags<[CoreOption, HelpHidden]>, HelpText<"Print LLVM IR after each pass.">; def print_after : Separate<["-", "/"], "print-after">, Group, Flags<[CoreOption, HelpHidden]>, @@ -370,7 +368,7 @@ def fvk_bind_register : MultiArg<["-"], "fvk-bind-register", 4>, MetaVarName<"; def vkbr : MultiArg<["-"], "vkbr", 4>, Flags<[CoreOption, DriverOption]>, Alias; def fvk_invert_y: Flag<["-"], "fvk-invert-y">, Group, Flags<[CoreOption, DriverOption]>, - HelpText<"Negate SV_Position.y before writing to stage output in VS/DS/GS to accommodate Vulkan's coordinate system">; + HelpText<"Negate SV_Position.y before writing to stage output in VS/DS/GS/MS/Lib to accommodate Vulkan's coordinate system">; def fvk_use_dx_position_w: Flag<["-"], "fvk-use-dx-position-w">, Group, Flags<[CoreOption, DriverOption]>, HelpText<"Reciprocate SV_Position.w after reading from stage input in PS to accommodate the difference between Vulkan and DirectX">; def fvk_support_nonzero_base_instance: Flag<["-"], "fvk-support-nonzero-base-instance">, Group, Flags<[CoreOption, DriverOption]>, @@ -405,6 +403,12 @@ def fspv_enable_maximal_reconvergence: Flag<["-"], "fspv-enable-maximal-reconver HelpText<"Enables the MaximallyReconvergesKHR execution mode for this module.">; def fspv_use_vulkan_memory_model: Flag<["-"], "fspv-use-vulkan-memory-model">, Group, Flags<[CoreOption, DriverOption]>, HelpText<"Generates SPIR-V modules that use the Vulkan memory model instead of GLSL450.">; +def fspv_use_unknown_image_format + : Flag<["-"], "fspv-use-unknown-image-format">, + Group, + Flags<[CoreOption, DriverOption]>, + HelpText<"For storage images and texel buffers, sets the default format to 'Unknown' when not specified via the `vk::image_format` attribute. If this option is not used, the format is inferred from the resource's data type.">; + def fvk_auto_shift_bindings: Flag<["-"], "fvk-auto-shift-bindings">, Group, Flags<[CoreOption, DriverOption]>, HelpText<"Apply fvk-*-shift to resources without an explicit register assignment.">; def Wno_vk_ignored_features : Joined<["-"], "Wno-vk-ignored-features">, Group, Flags<[CoreOption, DriverOption, HelpHidden]>, diff --git a/include/dxc/Support/SPIRVOptions.h b/include/dxc/Support/SPIRVOptions.h index 1b88ef4def..352cf6c2ec 100644 --- a/include/dxc/Support/SPIRVOptions.h +++ b/include/dxc/Support/SPIRVOptions.h @@ -71,6 +71,7 @@ struct SpirvCodeGenOptions { bool fixFuncCallArguments; bool enableMaximalReconvergence; bool useVulkanMemoryModel; + bool useUnknownImageFormat; bool IEEEStrict; /// Maximum length in words for the OpString literal containing the shader /// source for DebugSource and DebugSourceContinued. 
If the source code length diff --git a/include/dxc/Test/HlslTestUtils.h b/include/dxc/Test/HlslTestUtils.h index 0e37ccdcff..dd89fda676 100644 --- a/include/dxc/Test/HlslTestUtils.h +++ b/include/dxc/Test/HlslTestUtils.h @@ -10,6 +10,8 @@ /////////////////////////////////////////////////////////////////////////////// // *** THIS FILE CANNOT TAKE ANY LLVM DEPENDENCIES *** // +#ifndef HLSLTESTUTILS_H +#define HLSLTESTUTILS_H #include #include @@ -258,6 +260,29 @@ inline void LogErrorFmt(const wchar_t *fmt, ...) { WEX::Logging::Log::Error(buf.data()); } +inline void LogErrorFmtThrow(const char *fileName, int line, const wchar_t *fmt, + ...) { + va_list args; + va_start(args, fmt); + std::wstring buf(vFormatToWString(fmt, args)); + va_end(args); + + std::wstringstream wss; + wss << L"Error in file: " << fileName << L" at line: " << line << L"\n" + << buf.data() << L"\n" + << buf; + + WEX::Logging::Log::Error(wss.str().c_str()); + + // Throws an exception to abort the test. + VERIFY_FAIL(L"Test error"); +} + +// Macro to pass the file name and line number. Otherwise TAEF prints this file +// and line number. +#define LOG_ERROR_FMT_THROW(fmt, ...) \ + hlsl_test::LogErrorFmtThrow(__FILE__, __LINE__, fmt, __VA_ARGS__) + inline std::wstring GetPathToHlslDataFile(const wchar_t *relative, LPCWSTR paramName = HLSLDATAFILEPARAM, @@ -459,15 +484,17 @@ inline bool GetTestParamUseWARP(bool defaultVal) { #ifdef FP_SUBNORMAL -inline bool isdenorm(float f) { return FP_SUBNORMAL == std::fpclassify(f); } +template inline bool isdenorm(T f) { + return FP_SUBNORMAL == std::fpclassify(f); +} #else -inline bool isdenorm(float f) { - return (std::numeric_limits::denorm_min() <= f && - f < std::numeric_limits::min()) || - (-std::numeric_limits::min() < f && - f <= -std::numeric_limits::denorm_min()); +template inline bool isdenorm(T f) { + return (std::numeric_limits::denorm_min() <= f && + f < std::numeric_limits::min()) || + (-std::numeric_limits::min() < f && + f <= -std::numeric_limits::denorm_min()); } #endif // FP_SUBNORMAL @@ -515,6 +542,44 @@ inline bool isnanFloat16(uint16_t val) { uint16_t ConvertFloat32ToFloat16(float val) throw(); float ConvertFloat16ToFloat32(uint16_t val) throw(); +inline bool CompareDoubleULP( + const double &Src, const double &Ref, int64_t ULPTolerance, + hlsl::DXIL::Float32DenormMode Mode = hlsl::DXIL::Float32DenormMode::Any) { + if (Src == Ref) { + return true; + } + if (std::isnan(Src)) { + return std::isnan(Ref); + } + + if (Mode == hlsl::DXIL::Float32DenormMode::Any) { + // If denorm expected, output can be sign preserved zero. Otherwise output + // should pass the regular ulp testing. + if (isdenorm(Ref) && Src == 0 && std::signbit(Src) == std::signbit(Ref)) + return true; + } + + // For FTZ or Preserve mode, we should get the expected number within + // ULPTolerance for any operations. + int64_t Diff = *((const uint64_t *)&Src) - *((const uint64_t *)&Ref); + + uint64_t AbsoluteDiff = Diff < 0 ? -Diff : Diff; + return AbsoluteDiff <= (uint64_t)ULPTolerance; +} + +inline bool CompareDoubleEpsilon(const double &Src, const double &Ref, + float Epsilon) { + if (Src == Ref) { + return true; + } + if (std::isnan(Src)) { + return std::isnan(Ref); + } + // For FTZ or Preserve mode, we should get the expected number within + // epsilon for any operations. 
+ return fabs(Src - Ref) < Epsilon; +} + inline bool CompareFloatULP( const float &fsrc, const float &fref, int ULPTolerance, hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) { @@ -566,12 +631,26 @@ inline bool CompareFloatRelativeEpsilon( inline bool CompareHalfULP(const uint16_t &fsrc, const uint16_t &fref, float ULPTolerance) { + // Treat +0 and -0 as equal + if ((fsrc & ~FLOAT16_BIT_SIGN) == 0 && (fref & ~FLOAT16_BIT_SIGN) == 0) + return true; if (fsrc == fref) return true; - if (isnanFloat16(fsrc)) - return isnanFloat16(fref); + + const bool nanRef = isnanFloat16(fref); + const bool nanSrc = isnanFloat16(fsrc); + if (nanRef || nanSrc) + return nanRef && nanSrc; + + // Map to monotonic ordering for correct ULP diff + auto toOrdered = [](uint16_t h) -> int { + return (h & FLOAT16_BIT_SIGN) ? (~h & 0xFFFF) : (h | 0x8000); + }; + // 16-bit floating point numbers must preserve denorms - int diff = fsrc - fref; + int i_fsrc = toOrdered(fsrc); + int i_fref = toOrdered(fref); + int diff = i_fsrc - i_fref; unsigned int uDiff = diff < 0 ? -diff : diff; return uDiff <= (unsigned int)ULPTolerance; } @@ -735,3 +814,5 @@ inline UINT GetByteSizeForFormat(DXGI_FORMAT value) { } } #endif + +#endif // HLSLTESTUTILS_H diff --git a/include/dxc/dxcapi.internal.h b/include/dxc/dxcapi.internal.h index 28bd3e7066..46a485206e 100644 --- a/include/dxc/dxcapi.internal.h +++ b/include/dxc/dxcapi.internal.h @@ -7,9 +7,6 @@ // // // Provides non-public declarations for the DirectX Compiler component. // // // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. // -// All rights reserved. // -// // /////////////////////////////////////////////////////////////////////////////// #ifndef __DXC_API_INTERNAL__ @@ -133,11 +130,15 @@ enum LEGAL_INTRINSIC_COMPTYPES { LICOMPTYPE_HIT_OBJECT = 51, LICOMPTYPE_RAY_QUERY = 52, + LICOMPTYPE_LINALG = 53, // f32, partial-precision-f32, f16, + // i32, i16, u32, u16, + // int8_4packed, uint8_4packed + #ifdef ENABLE_SPIRV_CODEGEN - LICOMPTYPE_VK_BUFFER_POINTER = 53, - LICOMPTYPE_COUNT = 54 + LICOMPTYPE_VK_BUFFER_POINTER = 54, + LICOMPTYPE_COUNT = 55 #else - LICOMPTYPE_COUNT = 53 + LICOMPTYPE_COUNT = 54 #endif }; diff --git a/include/llvm/ADT/IntervalMap.h b/include/llvm/ADT/IntervalMap.h index 2a00667227..5bb948727e 100644 --- a/include/llvm/ADT/IntervalMap.h +++ b/include/llvm/ADT/IntervalMap.h @@ -320,7 +320,11 @@ class NodeBase { return Count; } else { // We want to shrink, copy to sib. - unsigned Count = std::min(std::min(unsigned(-Add), Size), N - SSize); + // Count <= INT_MAX: Since Add is an int, unsigned(-Add) <= 2^31, so + // std::min result <= INT_MAX. Meaning its safe to store the result in an + // int to avoid the compiler warning for '-Count' if we were to use an + // unsigned value instead. + int Count = std::min(std::min(unsigned(-Add), Size), N - SSize); transferToLeftSib(Size, Sib, SSize, Count); return -Count; } diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 270989b349..684ee0f9dc 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -36,12 +36,12 @@ static inline StringRef toStringRef(bool B) { /// Interpret the given character \p C as a hexadecimal digit and return its /// value. /// -/// If \p C is not a valid hex digit, -1U is returned. +/// If \p C is not a valid hex digit, ~0U is returned. 
static inline unsigned hexDigitValue(char C) { if (C >= '0' && C <= '9') return C-'0'; if (C >= 'a' && C <= 'f') return C-'a'+10U; if (C >= 'A' && C <= 'F') return C-'A'+10U; - return -1U; + return ~0U; } /// utohex_buffer - Emit the specified number into the buffer specified by diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index d4a6371216..ba63d80e94 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -191,12 +191,12 @@ class SDValue { template<> struct DenseMapInfo { static inline SDValue getEmptyKey() { SDValue V; - V.ResNo = -1U; + V.ResNo = ~0U; return V; } static inline SDValue getTombstoneKey() { SDValue V; - V.ResNo = -2U; + V.ResNo = ~1U; return V; } static unsigned getHashValue(const SDValue &Val) { @@ -879,7 +879,7 @@ inline SDValue::SDValue(SDNode *node, unsigned resno) : Node(node), ResNo(resno) { assert((!Node || ResNo < Node->getNumValues()) && "Invalid result number for the given node!"); - assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps."); + assert(ResNo < ~1U && "Cannot use result numbers reserved for DenseMaps."); } inline unsigned SDValue::getOpcode() const { diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h index 791f010a88..c34cfab284 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h @@ -32,12 +32,13 @@ class DWARFDebugAranges { void construct(); struct Range { - explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL, - uint32_t CUOffset = -1U) - : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {} + explicit Range(uint64_t LowPC = std::numeric_limits::max(), + uint64_t HighPC = std::numeric_limits::max(), + uint32_t CUOffset = std::numeric_limits::max()) + : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {} void setHighPC(uint64_t HighPC) { - if (HighPC == -1ULL || HighPC <= LowPC) + if (HighPC == std::numeric_limits::max() || HighPC <= LowPC) Length = 0; else Length = HighPC - LowPC; @@ -45,7 +46,7 @@ class DWARFDebugAranges { uint64_t HighPC() const { if (Length) return LowPC + Length; - return -1ULL; + return std::numeric_limits::max(); } bool containsAddress(uint64_t Address) const { diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index c930bd603d..8eea252b60 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -49,9 +49,9 @@ class DWARFDebugRangeList { bool isBaseAddressSelectionEntry(uint8_t AddressSize) const { assert(AddressSize == 4 || AddressSize == 8); if (AddressSize == 4) - return StartAddress == -1U; + return StartAddress == ~0U; else - return StartAddress == -1ULL; + return StartAddress == ~0ULL; } }; diff --git a/include/llvm/Support/BlockFrequency.h b/include/llvm/Support/BlockFrequency.h index 4304a253b2..d7d6d741f4 100644 --- a/include/llvm/Support/BlockFrequency.h +++ b/include/llvm/Support/BlockFrequency.h @@ -15,6 +15,7 @@ #define LLVM_SUPPORT_BLOCKFREQUENCY_H #include "llvm/Support/DataTypes.h" +#include namespace llvm { @@ -29,7 +30,9 @@ class BlockFrequency { BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { } /// \brief Returns the maximum possible frequency, the saturation value. 
- static uint64_t getMaxFrequency() { return -1ULL; } + static uint64_t getMaxFrequency() { + return std::numeric_limits::max(); + } /// \brief Returns the frequency as a fixpoint number scaled by the entry /// frequency. diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h index 1324cb82ca..f8a2843412 100644 --- a/include/llvm/Support/LEB128.h +++ b/include/llvm/Support/LEB128.h @@ -103,7 +103,7 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr) { } while (Byte >= 128); // Sign extend negative numbers. if (Byte & 0x40) - Value |= (-1ULL) << Shift; + Value |= (~0ULL) << Shift; if (n) *n = (unsigned)(p - orig_p); return Value; diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index aa0f9ed873..956c334374 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -1117,7 +1117,11 @@ AliasResult BasicAliasAnalysis::aliasGEP( // stripped a gep with negative index ('gep , -1, ...). if (V1Size != MemoryLocation::UnknownSize && V2Size != MemoryLocation::UnknownSize) { - if (-(uint64_t)GEP1BaseOffset < V1Size) + // GEP1BaseOffset is negative in this else block and because we're + // assigning to an unsigned variable, we can make use of + // -I == (~I + 1) to compute the absolute value of GEP1BaseOffset. + const uint64_t GEP1BaseOffsetAbs = (~GEP1BaseOffset + 1ULL); + if (GEP1BaseOffsetAbs < V1Size) return PartialAlias; return NoAlias; } diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 69c9b10b60..0167bdf0a1 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -187,7 +187,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { // Shift it to the right place, depending on endianness. Src = ConstantExpr::getShl(Src, ConstantInt::get(Src->getType(), ShiftAmt)); - ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize; + ShiftAmt += isLittleEndian ? SrcBitSize : (~SrcBitSize + 1U); // Mix it in. Elt = ConstantExpr::getOr(Elt, Src); @@ -213,7 +213,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { // endianness. Constant *Elt = ConstantExpr::getLShr(Src, ConstantInt::get(Src->getType(), ShiftAmt)); - ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize; + ShiftAmt += isLittleEndian ? DstBitSize : (~DstBitSize + 1U); // Truncate the element to an integer with the same pointer size and // convert the element back to a pointer using a inttoptr. diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 89c7cc7a3e..96c0b3302d 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -4109,7 +4109,7 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { // Shift it to the right place, depending on endianness. Src = ConstantExpr::getShl(Src, ConstantInt::get(Src->getType(), ShiftAmt)); - ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize; + ShiftAmt += isLittleEndian ? SrcBitSize : (~SrcBitSize + 1U); // Mix it in. Elt = ConstantExpr::getOr(Elt, Src); @@ -4144,9 +4144,9 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { for (unsigned j = 0; j != Ratio; ++j) { // Shift the piece of the value into the right place, depending on // endianness. - Constant *Elt = ConstantExpr::getLShr(Src, - ConstantInt::get(Src->getType(), ShiftAmt)); - ShiftAmt += isLittleEndian ? 
DstBitSize : -DstBitSize; + Constant *Elt = ConstantExpr::getLShr( + Src, ConstantInt::get(Src->getType(), ShiftAmt)); + ShiftAmt += isLittleEndian ? DstBitSize : (~DstBitSize + 1U); // Truncate the element to an integer with the same pointer size and // convert the element back to a pointer using a inttoptr. diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index d6316dc75b..d855df32dc 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -1179,7 +1179,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, MemAccessInfoSet &CheckDeps, const ValueToValueMap &Strides) { - MaxSafeDepDistBytes = -1U; + MaxSafeDepDistBytes = std::numeric_limits::max(); while (!CheckDeps.empty()) { MemAccessInfo CurAccess = *CheckDeps.begin(); @@ -1677,8 +1677,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const ValueToValueMap &Strides) : PtrRtChecking(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), - MaxSafeDepDistBytes(-1U), CanVecMem(false), - StoreToLoopInvariantAddress(false) { + MaxSafeDepDistBytes(std::numeric_limits::max()), + CanVecMem(false), StoreToLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(Strides); } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 185c291d66..a87128ca26 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2401,7 +2401,7 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) { if ((V & 1) == 0) return V >> 1; if (V != 1) - return -(V >> 1); + return ~(V >> 1) + 1; // There is no such thing as -0 with integers. "-0" really means MININT. return 1ULL << 63; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 0718c81451..f02344ae64 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1360,7 +1360,7 @@ static void emitSignedInt64(SmallVectorImpl &Vals, uint64_t V) { if ((int64_t)V >= 0) Vals.push_back(V << 1); else - Vals.push_back((-V << 1) | 1); + Vals.push_back(((~V + 1) << 1) | 1); } static void WriteConstants(unsigned FirstVal, unsigned LastVal, @@ -1437,7 +1437,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, continue; } const Constant *C = cast(V); - unsigned Code = -1U; + unsigned Code = ~0U; unsigned AbbrevToUse = 0; if (C->isNullValue()) { Code = bitc::CST_CODE_NULL; diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp index f614ba9d14..253121346a 100644 --- a/lib/DXIL/DxilOperations.cpp +++ b/lib/DXIL/DxilOperations.cpp @@ -765,32 +765,32 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = { "unary", Attribute::ReadNone, 1, - {{0x403}}, - {{0x3}}}, // Overloads: hfgetNumParams() <= 1) return nullptr; return FT->getParamType(1); @@ -6291,6 +6382,19 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) { StructType *ST = cast(Ty); return ST->getElementType(0); } + case OpCode::MatVecMul: + case OpCode::MatVecMulAdd: + if (FT->getNumParams() < 2) + return nullptr; + return llvm::StructType::get(Ctx, + {FT->getReturnType(), FT->getParamType(1)}); + + case OpCode::OuterProductAccumulate: + if (FT->getNumParams() < 3) + return nullptr; + return llvm::StructType::get(Ctx, + {FT->getParamType(1), FT->getParamType(2)}); + // OPCODE-OLOAD-TYPES:END default: return Ty; @@ -6334,7 +6438,7 @@ Type *OP::GetFourI32Type() const { return 
m_pFourI32Type; } Type *OP::GetFourI16Type() const { return m_pFourI16Type; } bool OP::IsResRetType(llvm::Type *Ty) { - if (!Ty->isStructTy()) + if (!Ty || !Ty->isStructTy()) return false; for (Type *ResTy : m_pResRetType) { if (Ty == ResTy) diff --git a/lib/DXIL/DxilUtil.cpp b/lib/DXIL/DxilUtil.cpp index 966c2e189c..cc0b509772 100644 --- a/lib/DXIL/DxilUtil.cpp +++ b/lib/DXIL/DxilUtil.cpp @@ -181,11 +181,11 @@ void PrintUnescapedString(StringRef Name, raw_ostream &Out) { if (C == '\\') { C = Name[++i]; unsigned value = hexDigitValue(C); - if (value != -1U) { + if (value != ~0U) { C = (unsigned char)value; unsigned value2 = hexDigitValue(Name[i + 1]); - assert(value2 != -1U && "otherwise, not a two digit hex escape"); - if (value2 != -1U) { + assert(value2 != ~0U && "otherwise, not a two digit hex escape"); + if (value2 != ~0U) { C = (C << 4) + (unsigned char)value2; ++i; } diff --git a/lib/DxcSupport/HLSLOptions.cpp b/lib/DxcSupport/HLSLOptions.cpp index 1ce7d0dfc0..b3eb422eb9 100644 --- a/lib/DxcSupport/HLSLOptions.cpp +++ b/lib/DxcSupport/HLSLOptions.cpp @@ -1033,20 +1033,6 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude, opts.ValVerMinor = (unsigned long)minor64; } - llvm::StringRef valSelectStr = Args.getLastArgValue(OPT_select_validator); - if (!valSelectStr.empty()) { - opts.SelectValidator = llvm::StringSwitch(valSelectStr) - .Case("auto", ValidatorSelection::Auto) - .Case("internal", ValidatorSelection::Internal) - .Case("external", ValidatorSelection::External) - .Default(ValidatorSelection::Invalid); - if (opts.SelectValidator == ValidatorSelection::Invalid) { - errors << "Unsupported value '" << valSelectStr - << "for -select-validator option."; - return 1; - } - } - if (opts.IsLibraryProfile() && Minor == 0xF) { if (opts.ValVerMajor != UINT_MAX && opts.ValVerMajor != 0) { errors << "Offline library profile cannot be used with non-zero " @@ -1134,6 +1120,8 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude, Args.hasFlag(OPT_fspv_enable_maximal_reconvergence, OPT_INVALID, false); opts.SpirvOptions.useVulkanMemoryModel = Args.hasFlag(OPT_fspv_use_vulkan_memory_model, OPT_INVALID, false); + opts.SpirvOptions.useUnknownImageFormat = + Args.hasFlag(OPT_fspv_use_unknown_image_format, OPT_INVALID, false); if (!handleVkShiftArgs(Args, OPT_fvk_b_shift, "b", &opts.SpirvOptions.bShift, errors) || diff --git a/lib/DxilContainer/DxcContainerBuilder.cpp b/lib/DxilContainer/DxcContainerBuilder.cpp index 770aa910a4..be182328dd 100644 --- a/lib/DxilContainer/DxcContainerBuilder.cpp +++ b/lib/DxilContainer/DxcContainerBuilder.cpp @@ -146,18 +146,14 @@ DxcContainerBuilder::SerializeContainer(IDxcOperationResult **ppResult) { // Combine existing warnings and errors from validation CComPtr pErrorBlob; CDxcMallocHeapPtr errorHeap(m_pMalloc); - SIZE_T warningLength = m_warning ? strlen(m_warning) : 0; - SIZE_T valErrorLength = + SIZE_T totalErrorLength = pValErrorUtf8 ? 
pValErrorUtf8->GetStringLength() : 0; - SIZE_T totalErrorLength = warningLength + valErrorLength; if (totalErrorLength) { SIZE_T errorSizeInBytes = totalErrorLength + 1; errorHeap.AllocateBytes(errorSizeInBytes); - if (warningLength) - memcpy(errorHeap.m_pData, m_warning, warningLength); - if (valErrorLength) - memcpy(errorHeap.m_pData + warningLength, - pValErrorUtf8->GetStringPointer(), valErrorLength); + + memcpy(errorHeap.m_pData, pValErrorUtf8->GetStringPointer(), + totalErrorLength); errorHeap.m_pData[totalErrorLength] = L'\0'; IFT(hlsl::DxcCreateBlobWithEncodingOnMalloc(errorHeap.m_pData, m_pMalloc, errorSizeInBytes, DXC_CP_UTF8, diff --git a/lib/DxilDia/DxcPixDxilStorage.cpp b/lib/DxilDia/DxcPixDxilStorage.cpp index 79d21303dc..4b06f472e8 100644 --- a/lib/DxilDia/DxcPixDxilStorage.cpp +++ b/lib/DxilDia/DxcPixDxilStorage.cpp @@ -185,7 +185,11 @@ dxil_debug_info::DxcPixDxilScalarStorage::Index(DWORD Index, STDMETHODIMP dxil_debug_info::DxcPixDxilScalarStorage::GetRegisterNumber( DWORD *pRegisterNumber) { const auto &ValueLocationMap = m_pVarInfo->m_ValueLocationMap; - auto RegIt = ValueLocationMap.find(m_OffsetFromStorageStartInBits); + // Bitfields will have been packed into their containing integer type: + DWORD size; + m_pOriginalType->GetSizeInBits(&size); + auto RegIt = + ValueLocationMap.find(m_OffsetFromStorageStartInBits & ~(size - 1)); if (RegIt == ValueLocationMap.end()) { return E_FAIL; diff --git a/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp index babf5b7953..88f696b7fa 100644 --- a/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp +++ b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp @@ -76,19 +76,29 @@ class DxilAnnotateWithVirtualRegister : public llvm::ModulePass { private: void AnnotateValues(llvm::Instruction *pI); - void AnnotateStore(llvm::Instruction *pI); - void SplitVectorStores(hlsl::OP *HlslOP, llvm::Instruction *pI); + void AnnotateStore(hlsl::OP *HlslOP, llvm::Instruction *pI); + void SplitVectorStores(llvm::Instruction *pI); bool IsAllocaRegisterWrite(llvm::Value *V, llvm::AllocaInst **pAI, llvm::Value **pIdx); void AnnotateAlloca(llvm::AllocaInst *pAlloca); void AnnotateGeneric(llvm::Instruction *pI); void AssignNewDxilRegister(llvm::Instruction *pI); void AssignNewAllocaRegister(llvm::AllocaInst *pAlloca, std::uint32_t C); - + llvm::Value *AddConstIntValues(llvm::Value *l, llvm::Value *r); + llvm::Value *MultiplyConstIntValue(llvm::Value *l, uint32_t r); + llvm::Value *GetStructOffset(llvm::GetElementPtrInst *pGEP, + uint32_t &GEPOperandIndex, + llvm::Type *pElementType); hlsl::DxilModule *m_DM; std::uint32_t m_uVReg; std::unique_ptr m_MST; int m_StartInstruction = 0; + struct RememberedAllocaStores { + llvm::StoreInst *StoreInst; + llvm::Value *Index; + llvm::MDNode *AllocaReg; + }; + std::vector m_RememberedAllocaStores; void Init(llvm::Module &M) { m_DM = &M.GetOrCreateDxilModule(); @@ -129,8 +139,6 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) { m_DM->SetValidatorVersion(1, 4); } - std::uint32_t InstNum = m_StartInstruction; - auto instrumentableFunctions = PIXPassHelpers::GetAllInstrumentableFunctions(*m_DM); @@ -138,7 +146,7 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) { for (auto &block : F->getBasicBlockList()) { for (auto it = block.begin(); it != block.end();) { llvm::Instruction *I = &*(it++); - SplitVectorStores(m_DM->GetOP(), I); + SplitVectorStores(I); } } } @@ -151,17 +159,32 @@ bool 
DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) { } } + // Process all allocas referenced by dbg.declare intrinsics for (auto *F : instrumentableFunctions) { for (auto &block : F->getBasicBlockList()) { - for (llvm::Instruction &I : block.getInstList()) { - AnnotateStore(&I); + for (auto &I : block) { + if (auto *DbgDeclare = llvm::dyn_cast(&I)) { + // The first operand of DbgDeclare is the address (typically an + // AllocaInst) + if (auto *AddrVal = + llvm::dyn_cast(DbgDeclare->getAddress())) { + AnnotateValues(AddrVal); + } + } } } } + for (auto *F : instrumentableFunctions) + for (auto &block : F->getBasicBlockList()) { + for (llvm::Instruction &I : block.getInstList()) { + AnnotateStore(m_DM->GetOP(), &I); + } + } + for (auto *F : instrumentableFunctions) { - int InstructionRangeStart = InstNum; - int InstructionRangeEnd = InstNum; + int InstructionRangeStart = m_StartInstruction; + int InstructionRangeEnd = m_StartInstruction; for (auto &block : F->getBasicBlockList()) { for (llvm::Instruction &I : block.getInstList()) { // If the instruction is part of the debug value instrumentation added @@ -171,8 +194,9 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) { if (PixAllocaReg::FromInst(Alloca, &unused1, &unused2)) continue; if (!llvm::isa(&I)) { - pix_dxil::PixDxilInstNum::AddMD(M.getContext(), &I, InstNum++); - InstructionRangeEnd = InstNum; + pix_dxil::PixDxilInstNum::AddMD(M.getContext(), &I, + m_StartInstruction++); + InstructionRangeEnd = m_StartInstruction; } } } @@ -188,12 +212,17 @@ bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) { } } + for (auto const &as : m_RememberedAllocaStores) { + PixAllocaRegWrite::AddMD(m_DM->GetCtx(), as.StoreInst, as.AllocaReg, + as.Index); + } + if (OSOverride != nullptr) { // Print a set of strings of the exemplary form "InstructionCount: // " if (m_DM->GetShaderModel()->GetKind() == hlsl::ShaderModel::Kind::Library) *OSOverride << "\nIsLibrary\n"; - *OSOverride << "\nInstructionCount:" << InstNum << "\n"; + *OSOverride << "\nInstructionCount:" << m_StartInstruction << "\n"; } m_DM = nullptr; @@ -210,7 +239,8 @@ void DxilAnnotateWithVirtualRegister::AnnotateValues(llvm::Instruction *pI) { } } -void DxilAnnotateWithVirtualRegister::AnnotateStore(llvm::Instruction *pI) { +void DxilAnnotateWithVirtualRegister::AnnotateStore(hlsl::OP *HlslOP, + llvm::Instruction *pI) { auto *pSt = llvm::dyn_cast(pI); if (pSt == nullptr) { return; @@ -226,15 +256,47 @@ void DxilAnnotateWithVirtualRegister::AnnotateStore(llvm::Instruction *pI) { if (AllocaReg == nullptr) { return; } + m_RememberedAllocaStores.push_back({pSt, Index, AllocaReg}); +} + +llvm::Value * +DxilAnnotateWithVirtualRegister::MultiplyConstIntValue(llvm::Value *l, + uint32_t r) { + if (r == 1) + return l; + if (auto *lci = llvm::dyn_cast(l)) + return m_DM->GetOP()->GetU32Const(lci->getLimitedValue() * r); + // Should never get here, but if we do, return the left as a reasonable + // default: + return l; +} - PixAllocaRegWrite::AddMD(m_DM->GetCtx(), pSt, AllocaReg, Index); +llvm::Value * +DxilAnnotateWithVirtualRegister::AddConstIntValues(llvm::Value *l, + llvm::Value *r) { + auto *rci = llvm::dyn_cast(r); + if (rci && rci->getLimitedValue() == 0) + return l; + auto *lci = llvm::dyn_cast(l); + if (lci && lci->getLimitedValue() == 0) + return r; + // Both an assert and a check, in case of unexpected circumstances. 
+ DXASSERT(lci != nullptr && rci != nullptr, + "Both sides of add should be constant ints"); + if (lci != nullptr && rci != nullptr) + return m_DM->GetOP()->GetU32Const(lci->getLimitedValue() + + rci->getLimitedValue()); + // In an emergency, return the left argument. It'll be closest to + // the desired value. + return l; } -static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP, - uint32_t &GEPOperandIndex, - llvm::Type *pElementType) { +llvm::Value * +DxilAnnotateWithVirtualRegister::GetStructOffset(llvm::GetElementPtrInst *pGEP, + uint32_t &GEPOperandIndex, + llvm::Type *pElementType) { if (IsInstrumentableFundamentalType(pElementType)) { - return 0; + return m_DM->GetOP()->GetU32Const(0); } else if (auto *pArray = llvm::dyn_cast(pElementType)) { // 1D-array example: // @@ -248,18 +310,13 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP, // -The zeroth element in the struct (which is the array) // -The zeroth element in that array - auto *pArrayIndex = - llvm::dyn_cast(pGEP->getOperand(GEPOperandIndex++)); - - if (pArrayIndex == nullptr) { - return 0; - } + auto *pArrayIndex = pGEP->getOperand(GEPOperandIndex++); - uint32_t ArrayIndex = pArrayIndex->getLimitedValue(); auto pArrayElementType = pArray->getArrayElementType(); - uint32_t MemberIndex = ArrayIndex * CountStructMembers(pArrayElementType); - return MemberIndex + - GetStructOffset(pGEP, GEPOperandIndex, pArrayElementType); + auto *MemberIndex = MultiplyConstIntValue( + pArrayIndex, CountStructMembers(pArrayElementType)); + return AddConstIntValues( + MemberIndex, GetStructOffset(pGEP, GEPOperandIndex, pArrayElementType)); } else if (auto *pStruct = llvm::dyn_cast(pElementType)) { DXASSERT(GEPOperandIndex < pGEP->getNumOperands(), "Unexpectedly read too many GetElementPtrInst operands"); @@ -268,7 +325,7 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP, llvm::dyn_cast(pGEP->getOperand(GEPOperandIndex++)); if (pMemberIndex == nullptr) { - return 0; + return m_DM->GetOP()->GetU32Const(0); } uint32_t MemberIndex = pMemberIndex->getLimitedValue(); @@ -278,16 +335,17 @@ static uint32_t GetStructOffset(llvm::GetElementPtrInst *pGEP, MemberOffset += CountStructMembers(pStruct->getElementType(i)); } - return MemberOffset + GetStructOffset(pGEP, GEPOperandIndex, - pStruct->getElementType(MemberIndex)); + return AddConstIntValues( + m_DM->GetOP()->GetU32Const(MemberOffset), + GetStructOffset(pGEP, GEPOperandIndex, + pStruct->getElementType(MemberIndex))); } else { - return 0; + return m_DM->GetOP()->GetU32Const(0); } } bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite( llvm::Value *V, llvm::AllocaInst **pAI, llvm::Value **pIdx) { - llvm::IRBuilder<> B(m_DM->GetCtx()); *pAI = nullptr; *pIdx = nullptr; @@ -366,7 +424,8 @@ bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite( auto offset = GetStructOffset(pGEP, GEPOperandIndex, pStructType); - llvm::Value *IndexValue = B.getInt32(offset + precedingMemberCount); + llvm::Value *IndexValue = AddConstIntValues( + offset, m_DM->GetOP()->GetU32Const(precedingMemberCount)); if (IndexValue != nullptr) { *pAI = Alloca; @@ -383,7 +442,7 @@ bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite( } *pAI = pAlloca; - *pIdx = B.getInt32(0); + *pIdx = m_DM->GetOP()->GetU32Const(0); return true; } @@ -463,12 +522,13 @@ void DxilAnnotateWithVirtualRegister::AssignNewDxilRegister( void DxilAnnotateWithVirtualRegister::AssignNewAllocaRegister( llvm::AllocaInst *pAlloca, std::uint32_t C) { - PixAllocaReg::AddMD(m_DM->GetCtx(), pAlloca, m_uVReg, C); - 
m_uVReg += C; + if (!PixAllocaReg::FromInst(pAlloca, nullptr, nullptr)) { + PixAllocaReg::AddMD(m_DM->GetCtx(), pAlloca, m_uVReg, C); + m_uVReg += C; + } } -void DxilAnnotateWithVirtualRegister::SplitVectorStores(hlsl::OP *HlslOP, - llvm::Instruction *pI) { +void DxilAnnotateWithVirtualRegister::SplitVectorStores(llvm::Instruction *pI) { auto *pSt = llvm::dyn_cast(pI); if (pSt == nullptr) { return; diff --git a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp index bf25d9f85f..9ddbe876b5 100644 --- a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp +++ b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp @@ -36,7 +36,7 @@ using namespace PIXPassHelpers; using namespace llvm; -//#define VALUE_TO_DECLARE_LOGGING +// #define VALUE_TO_DECLARE_LOGGING #ifdef VALUE_TO_DECLARE_LOGGING #ifndef PIX_DEBUG_DUMP_HELPER @@ -859,8 +859,8 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M, VALUE_TO_DECLARE_LOG("... variable was null too"); } - llvm::Value *V = DbgValue->getValue(); - if (V == nullptr) { + llvm::Value *ValueFromDbgInst = DbgValue->getValue(); + if (ValueFromDbgInst == nullptr) { // The metadata contained a null Value, so we ignore it. This // seems to be a dxcompiler bug. VALUE_TO_DECLARE_LOG("...Null value!"); @@ -873,20 +873,20 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M, return; } - if (llvm::isa(V->getType())) { + if (llvm::isa(ValueFromDbgInst->getType())) { // Safeguard: If the type is not a pointer type, then this is // dbg.value directly pointing to a memory location instead of // a value. if (!IsDITypePointer(Ty, EmptyMap)) { // We only know how to handle AllocaInsts for now - if (!isa(V)) { + if (!isa(ValueFromDbgInst)) { VALUE_TO_DECLARE_LOG( "... variable had pointer type, but is not an alloca."); return; } IRBuilder<> B(DbgValue->getNextNode()); - V = B.CreateLoad(V); + ValueFromDbgInst = B.CreateLoad(ValueFromDbgInst); } } @@ -931,7 +931,7 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M, } const OffsetInBits InitialOffset = PackedOffsetFromVar; - auto *insertPt = llvm::dyn_cast(V); + auto *insertPt = llvm::dyn_cast(ValueFromDbgInst); if (insertPt != nullptr && !llvm::isa(insertPt)) { insertPt = insertPt->getNextNode(); // Drivers may crash if phi nodes aren't always at the top of a block, @@ -950,7 +950,8 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M, // Offset}. InitialOffset is the offset from DbgValue's expression // (i.e., the offset from the Variable's start), and Offset is the // Scalar Value's packed offset from DbgValue's value. 
- for (const ValueAndOffset &VO : SplitValue(V, InitialOffset, B)) { + for (const ValueAndOffset &VO : + SplitValue(ValueFromDbgInst, InitialOffset, B)) { OffsetInBits AlignedOffset; if (!Offsets.GetAlignedOffsetFromPackedOffset(VO.m_PackedOffset, diff --git a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp index a7d7e72cb4..4dd43b07cc 100644 --- a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp +++ b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp @@ -1356,7 +1356,19 @@ DxilDebugInstrumentation::FindInstrumentableInstructionsInBlock( IndexingToken = "s"; // static indexing, no debug output required } else { IndexingToken = "d"; // dynamic indexing - RegisterOrStaticIndex = std::to_string(IandT->AllocaBase); + int MaxArraySize = 1; + if (auto *Store = dyn_cast<StoreInst>(&Inst)) { + if (auto *GEP = + dyn_cast<GetElementPtrInst>(Store->getPointerOperand())) { + if (auto *Alloca = + dyn_cast<AllocaInst>(GEP->getPointerOperand())) { + MaxArraySize = + Alloca->getAllocatedType()->getArrayNumElements(); + } + } + } + RegisterOrStaticIndex = std::to_string(IandT->AllocaBase) + "-" + + std::to_string(MaxArraySize); DebugOutputForThisInstruction.ValueToWriteToDebugMemory = IandT->AllocaWriteIndex; } @@ -1374,7 +1386,8 @@ DxilDebugInstrumentation::FindInstrumentableInstructionsInBlock( *OSOverride << "," << *RegisterOrStaticIndex; } if (IandT->ConstantAllocaStoreValue) { - *OSOverride << "," << std::to_string(*IandT->ConstantAllocaStoreValue); + uint64_t value = IandT->ConstantAllocaStoreValue.value(); + *OSOverride << "," << std::to_string(value); } *OSOverride << ";"; if (DebugOutputForThisInstruction.ValueToWriteToDebugMemory) diff --git a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp index f68e2082bc..a60f6a77a7 100644 --- a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp +++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp @@ -124,8 +124,10 @@ static bool ParsePixAllocaReg(llvm::MDNode *MD, std::uint32_t *RegNum, return false; } - *RegNum = mdRegNum->getLimitedValue(); - *Count = mdCount->getLimitedValue(); + if (RegNum != nullptr) + *RegNum = mdRegNum->getLimitedValue(); + if (Count != nullptr) + *Count = mdCount->getLimitedValue(); return true; } @@ -144,8 +146,10 @@ void pix_dxil::PixAllocaReg::AddMD(llvm::LLVMContext &Ctx, bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst const *pAlloca, std::uint32_t *pRegBase, std::uint32_t *pRegSize) { - *pRegBase = 0; - *pRegSize = 0; + if (pRegBase != nullptr) + *pRegBase = 0; + if (pRegSize != nullptr) + *pRegSize = 0; auto *mdNodes = pAlloca->getMetadata(MDName); if (mdNodes == nullptr) { diff --git a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp index 00a6b9ae14..9587897e22 100644 --- a/lib/DxilValidation/DxilValidation.cpp +++ b/lib/DxilValidation/DxilValidation.cpp @@ -165,7 +165,8 @@ ValidateSignatureAccess(Instruction *I, DxilSignature &Sig, Value *SigId, static DxilResourceProperties GetResourceFromHandle(Value *Handle, ValidationContext &ValCtx) { - if (!isa<CallInst>(Handle)) { + CallInst *HandleCall = dyn_cast<CallInst>(Handle); + if (!HandleCall) { if (Instruction *I = dyn_cast<Instruction>(Handle)) ValCtx.EmitInstrError(I, ValidationRule::InstrHandleNotFromCreateHandle); else @@ -175,10 +176,13 @@ static DxilResourceProperties GetResourceFromHandle(Value *Handle, } DxilResourceProperties RP = ValCtx.GetResourceFromVal(Handle); - if (RP.getResourceClass() == DXIL::ResourceClass::Invalid) { + if (RP.getResourceClass() == DXIL::ResourceClass::Invalid) ValCtx.EmitInstrError(cast<Instruction>(Handle),
ValidationRule::InstrHandleNotFromCreateHandle); - } + if (RP.Basic.IsReorderCoherent && + !ValCtx.DxilMod.GetShaderModel()->IsSM69Plus()) + ValCtx.EmitInstrError(HandleCall, + ValidationRule::InstrReorderCoherentRequiresSM69); return RP; } @@ -970,6 +974,293 @@ static void ValidateImmOperandForMathDxilOp(CallInst *CI, DXIL::OpCode Opcode, } } +static bool CheckLinalgInterpretation(uint32_t Input, bool InRegister) { + using CT = DXIL::ComponentType; + switch (static_cast<CT>(Input)) { + case CT::I16: + case CT::U16: + case CT::I32: + case CT::U32: + case CT::F16: + case CT::F32: + case CT::U8: + case CT::I8: + case CT::F8_E4M3: + case CT::F8_E5M2: + return true; + case CT::PackedS8x32: + case CT::PackedU8x32: + return InRegister; + default: + return false; + } +} + +static bool CheckMatrixLayoutForMatVecMulOps(unsigned Layout) { + return Layout <= + static_cast<unsigned>(DXIL::LinalgMatrixLayout::OuterProductOptimal); +} + +std::string GetMatrixLayoutStr(unsigned Layout) { + switch (static_cast<DXIL::LinalgMatrixLayout>(Layout)) { + case DXIL::LinalgMatrixLayout::RowMajor: + return "RowMajor"; + case DXIL::LinalgMatrixLayout::ColumnMajor: + return "ColumnMajor"; + case DXIL::LinalgMatrixLayout::MulOptimal: + return "MulOptimal"; + case DXIL::LinalgMatrixLayout::OuterProductOptimal: + return "OuterProductOptimal"; + default: + DXASSERT_NOMSG(false); + return "Invalid"; + } +} + +static bool CheckTransposeForMatrixLayout(unsigned Layout, bool Transposed) { + switch (static_cast<DXIL::LinalgMatrixLayout>(Layout)) { + case DXIL::LinalgMatrixLayout::RowMajor: + case DXIL::LinalgMatrixLayout::ColumnMajor: + return !Transposed; + + default: + return true; + } +} + +static bool CheckUnsignedFlag(Type *VecTy, bool IsUnsigned) { + Type *ElemTy = VecTy->getScalarType(); + if (ElemTy->isFloatingPointTy()) + return !IsUnsigned; + + return true; +} + +static Value *GetMatVecOpIsOutputUnsigned(CallInst *CI, DXIL::OpCode OpCode) { + switch (OpCode) { + case DXIL::OpCode::MatVecMul: + return CI->getOperand(DXIL::OperandIndex::kMatVecMulIsOutputUnsignedIdx); + case DXIL::OpCode::MatVecMulAdd: + return CI->getOperand(DXIL::OperandIndex::kMatVecMulAddIsOutputUnsignedIdx); + + default: + DXASSERT_NOMSG(false); + return nullptr; + } +}
+static void ValidateImmOperandsForMatVecOps(CallInst *CI, DXIL::OpCode OpCode, + ValidationContext &ValCtx) { + + llvm::Value *IsInputUnsigned = + CI->getOperand(DXIL::OperandIndex::kMatVecMulIsInputUnsignedIdx); + ConstantInt *IsInputUnsignedConst = + dyn_cast<ConstantInt>(IsInputUnsigned); + if (!IsInputUnsignedConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst, + {"IsInputUnsigned"}); + return; + } + + llvm::Value *IsOutputUnsigned = GetMatVecOpIsOutputUnsigned(CI, OpCode); + ConstantInt *IsOutputUnsignedConst = + dyn_cast<ConstantInt>(IsOutputUnsigned); + if (!IsOutputUnsignedConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst, + {"IsOutputUnsigned"}); + return; + } + + llvm::Value *InputInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulInputInterpretationIdx); + ConstantInt *II = dyn_cast<ConstantInt>(InputInterpretation); + if (!II) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"InputInterpretation"}); + return; + } + uint64_t IIValue = II->getLimitedValue(); + if (!CheckLinalgInterpretation(IIValue, true)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidRegisterInterpValue, + {std::to_string(IIValue), "Input"}); + return; + } + + llvm::Value *MatrixInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixInterpretationIdx); + ConstantInt *MI = dyn_cast<ConstantInt>(MatrixInterpretation); + if (!MI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"MatrixInterpretation"}); + return; + } + uint64_t MIValue = MI->getLimitedValue(); + if (!CheckLinalgInterpretation(MIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(MIValue), "Matrix"}); + return; + } + + llvm::Value *MatrixM = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixMIdx); + if (!llvm::isa<ConstantInt>(MatrixM)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix M dimension"}); + return; + } + + llvm::Value *MatrixK = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixKIdx); + if (!llvm::isa<ConstantInt>(MatrixK)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix K dimension"}); + return; + } + + llvm::Value *MatrixLayout = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixLayoutIdx); + + ConstantInt *MatrixLayoutConst = dyn_cast<ConstantInt>(MatrixLayout); + if (!MatrixLayoutConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix Layout"}); + return; + } + uint64_t MLValue = MatrixLayoutConst->getLimitedValue(); + if (!CheckMatrixLayoutForMatVecMulOps(MLValue)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMatrixLayoutValueForMatVecOps, + {std::to_string(MLValue), + std::to_string( + static_cast<unsigned>(DXIL::LinalgMatrixLayout::RowMajor)), + std::to_string(static_cast<unsigned>( + DXIL::LinalgMatrixLayout::OuterProductOptimal))}); + return; + } + + llvm::Value *MatrixTranspose = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixTransposeIdx); + ConstantInt *MatrixTransposeConst = dyn_cast<ConstantInt>(MatrixTranspose); + if (!MatrixTransposeConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"MatrixTranspose"}); + return; + } + + if (!CheckTransposeForMatrixLayout(MLValue, + MatrixTransposeConst->getLimitedValue())) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixLayoutNotTransposable, + {GetMatrixLayoutStr(MLValue)}); + return; + } + + llvm::Value *InputVector = + CI->getOperand(DXIL::OperandIndex::kMatVecMulInputVectorIdx); + if (!CheckUnsignedFlag(InputVector->getType(), + IsInputUnsignedConst->getLimitedValue())) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Input"}); + return; + } + + if (!CheckUnsignedFlag(CI->getType(), + IsOutputUnsignedConst->getLimitedValue())) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Output"}); + return; + } + + switch (OpCode) { + case DXIL::OpCode::MatVecMulAdd: { + llvm::Value *BiasInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulAddBiasInterpretation); + ConstantInt *BI = cast<ConstantInt>(BiasInterpretation); + if (!BI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"BiasInterpretation"}); + return; + } + uint64_t BIValue = BI->getLimitedValue(); + if (!CheckLinalgInterpretation(BIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(BIValue), "Bias vector"}); + return; + } + } break; + default: + break; + } +}
+static void ValidateImmOperandsForOuterProdAcc(CallInst *CI, + ValidationContext &ValCtx) { + + llvm::Value *MatrixInterpretation = + CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixInterpretation); + ConstantInt *MI = cast<ConstantInt>(MatrixInterpretation); + if (!MI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"MatrixInterpretation"}); + return; + } + uint64_t MIValue = MI->getLimitedValue(); + if (!CheckLinalgInterpretation(MIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(MIValue), "Matrix"}); + return; + } + + llvm::Value *MatrixLayout = + CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixLayout); + if (!llvm::isa<ConstantInt>(MatrixLayout)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"MatrixLayout"}); + return; + } + ConstantInt *ML = cast<ConstantInt>(MatrixLayout); + uint64_t MLValue = ML->getLimitedValue(); + if (MLValue != + static_cast<unsigned>(DXIL::LinalgMatrixLayout::OuterProductOptimal)) + ValCtx.EmitInstrFormatError( + CI, + ValidationRule:: + InstrLinalgInvalidMatrixLayoutValueForOuterProductAccumulate, + {GetMatrixLayoutStr(MLValue), + GetMatrixLayoutStr(static_cast<unsigned>( + DXIL::LinalgMatrixLayout::OuterProductOptimal))}); + + llvm::Value *MatrixStride = + CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixStride); + if (!llvm::isa<ConstantInt>(MatrixStride)) { + ValCtx.EmitInstrError( + CI, ValidationRule::InstrLinalgMatrixStrideZeroForOptimalLayouts); + return; + } + ConstantInt *MS = cast<ConstantInt>(MatrixStride); + uint64_t MSValue = MS->getLimitedValue(); + if (MSValue != 0) { + ValCtx.EmitInstrError( + CI, ValidationRule::InstrLinalgMatrixStrideZeroForOptimalLayouts); + return; + } +} + // Validate the type-defined mask compared to the store value mask which // indicates which parts were defined returns true if caller should continue // validation @@ -1282,9 +1573,15 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode Opcode, ValCtx.EmitInstrError(CI, ValidationRule::InstrCheckAccessFullyMapped); } else { Value *V = EVI->getOperand(0); + StructType *StrTy = dyn_cast<StructType>(V->getType()); + unsigned ExtractIndex = EVI->getIndices()[0]; + // Ensure parameter is a single value that is extracted from the correct + // ResRet struct location.
bool IsLegal = EVI->getNumIndices() == 1 && - EVI->getIndices()[0] == DXIL::kResRetStatusIndex && - ValCtx.DxilMod.GetOP()->IsResRetType(V->getType()); + (ExtractIndex == DXIL::kResRetStatusIndex || + ExtractIndex == DXIL::kVecResRetStatusIndex) && + ValCtx.DxilMod.GetOP()->IsResRetType(StrTy) && + ExtractIndex == StrTy->getNumElements() - 1; if (!IsLegal) { ValCtx.EmitInstrError(CI, ValidationRule::InstrCheckAccessFullyMapped); } @@ -1644,6 +1941,46 @@ static unsigned getSemanticFlagValidMask(const ShaderModel *pSM) { return static_cast(hlsl::DXIL::BarrierSemanticFlag::ValidMask); } +StringRef GetOpCodeName(DXIL::OpCode OpCode) { + switch (OpCode) { + default: + DXASSERT(false, "Unexpected op code"); + return ""; + case DXIL::OpCode::HitObject_ObjectRayOrigin: + return "HitObject_ObjectRayOrigin"; + case DXIL::OpCode::HitObject_WorldRayDirection: + return "HitObject_WorldRayDirection"; + case DXIL::OpCode::HitObject_WorldRayOrigin: + return "HitObject_WorldRayOrigin"; + case DXIL::OpCode::HitObject_ObjectRayDirection: + return "HitObject_ObjectRayDirection"; + case DXIL::OpCode::HitObject_WorldToObject3x4: + return "HitObject_WorldToObject3x4"; + case DXIL::OpCode::HitObject_ObjectToWorld3x4: + return "HitObject_ObjectToWorld3x4"; + } +} + +static void ValidateConstantRangeUnsigned(Value *Val, StringRef Name, + uint64_t LowerBound, + uint64_t UpperBound, CallInst *CI, + DXIL::OpCode OpCode, + ValidationContext &ValCtx) { + ConstantInt *C = dyn_cast(Val); + if (!C) { + ValCtx.EmitInstrFormatError(CI, ValidationRule::InstrOpConst, + {Name, GetOpCodeName(OpCode)}); + return; + } + if (C->uge(UpperBound + 1U) || !C->uge(LowerBound)) { + std::string Range = + std::to_string(LowerBound) + "~" + std::to_string(UpperBound); + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrOperandRange, + {Name, Range, C->getValue().toString(10, false)}); + } +} + static void ValidateDxilOperationCallInProfile(CallInst *CI, DXIL::OpCode Opcode, const ShaderModel *pSM, @@ -1909,7 +2246,109 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI, ValCtx.EmitInstrError( CI, ValidationRule::InstrMayReorderThreadUndefCoherenceHintParam); } break; + case DXIL::OpCode::HitObject_MakeMiss: { + DxilInst_HitObject_MakeMiss MakeMiss(CI); + if (isa(MakeMiss.get_RayFlags()) || + isa(MakeMiss.get_MissShaderIndex())) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + } break; + case DXIL::OpCode::HitObject_LoadLocalRootTableConstant: { + Value *HitObject = CI->getArgOperand(1); + if (isa(HitObject)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + Value *Offset = CI->getArgOperand(2); + if (isa(Offset)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + if (ConstantInt *COffset = dyn_cast(Offset)) { + if (COffset->getLimitedValue() % 4 != 0) + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrParamMultiple, + {"offset", "4", COffset->getValue().toString(10, false)}); + } + break; + } + case DXIL::OpCode::HitObject_SetShaderTableIndex: { + Value *HitObject = CI->getArgOperand(1); + if (isa(HitObject)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + Value *RecordIndex = CI->getArgOperand(2); + if (isa(RecordIndex)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + break; + } + + // Shader Execution Reordering - scalar getters + case DXIL::OpCode::HitObject_GeometryIndex: + case DXIL::OpCode::HitObject_HitKind: + case DXIL::OpCode::HitObject_InstanceID: + case 
DXIL::OpCode::HitObject_InstanceIndex: + case DXIL::OpCode::HitObject_IsHit: + case DXIL::OpCode::HitObject_IsMiss: + case DXIL::OpCode::HitObject_IsNop: + case DXIL::OpCode::HitObject_PrimitiveIndex: + case DXIL::OpCode::HitObject_RayFlags: + case DXIL::OpCode::HitObject_RayTCurrent: + case DXIL::OpCode::HitObject_RayTMin: + case DXIL::OpCode::HitObject_ShaderTableIndex: { + Value *HitObject = CI->getArgOperand(1); + if (isa(HitObject)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + break; + } + + // Shader Execution Reordering - vector getters + case DXIL::OpCode::HitObject_ObjectRayDirection: + case DXIL::OpCode::HitObject_ObjectRayOrigin: + case DXIL::OpCode::HitObject_WorldRayDirection: + case DXIL::OpCode::HitObject_WorldRayOrigin: { + Value *HitObject = CI->getArgOperand(1); + if (isa(HitObject)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + Value *Col = CI->getArgOperand(2); + ValidateConstantRangeUnsigned(Col, "component", 0, 2, CI, Opcode, ValCtx); + break; + } + + // Shader Execution Reordering - matrix getters + case DXIL::OpCode::HitObject_WorldToObject3x4: + case DXIL::OpCode::HitObject_ObjectToWorld3x4: { + Value *HitObject = CI->getArgOperand(1); + if (isa(HitObject)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + Value *Row = CI->getArgOperand(2); + ValidateConstantRangeUnsigned(Row, "row", 0, 2, CI, Opcode, ValCtx); + Value *Col = CI->getArgOperand(3); + ValidateConstantRangeUnsigned(Col, "column", 0, 3, CI, Opcode, ValCtx); + break; + } + + // Shader Execution Reordering - from ray query + case DXIL::OpCode::HitObject_FromRayQuery: + case DXIL::OpCode::HitObject_FromRayQueryWithAttrs: { + for (unsigned i = 1; i < CI->getNumOperands(); ++i) { + Value *Arg = CI->getArgOperand(i); + if (isa(Arg)) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + } + break; + } + + case DXIL::OpCode::HitObject_Invoke: { + if (isa(CI->getArgOperand(1))) + ValCtx.EmitInstrError(CI, ValidationRule::InstrUndefHitObject); + if (isa(CI->getArgOperand(2))) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + } break; + case DXIL::OpCode::HitObject_TraceRay: { + Value *Hdl = CI->getArgOperand( + DxilInst_HitObject_TraceRay::arg_accelerationStructure); + ValidateASHandle(CI, Hdl, ValCtx); + for (unsigned ArgIdx = 2; ArgIdx < CI->getNumArgOperands(); ++ArgIdx) + if (isa(CI->getArgOperand(ArgIdx))) + ValCtx.EmitInstrError(CI, ValidationRule::InstrNoReadingUninitialized); + DxilInst_HitObject_TraceRay HOTraceRay(CI); + } break; case DXIL::OpCode::AtomicBinOp: case DXIL::OpCode::AtomicCompareExchange: { Type *pOverloadType = OP::GetOverloadType(Opcode, CI->getCalledFunction()); @@ -1994,6 +2433,16 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI, GetLaunchTypeStr(NodeLaunchType)}); break; + case DXIL::OpCode::MatVecMul: + case DXIL::OpCode::MatVecMulAdd: + ValidateImmOperandsForMatVecOps(CI, Opcode, ValCtx); + break; + case DXIL::OpCode::OuterProductAccumulate: + ValidateImmOperandsForOuterProdAcc(CI, ValCtx); + break; + case DXIL::OpCode::VectorAccumulate: + + break; default: // TODO: make sure every Opcode is checked. @@ -2212,6 +2661,9 @@ static bool ValidateType(Type *Ty, ValidationContext &ValCtx, if (ValCtx.HandleTy == Ty) return true; hlsl::OP *HlslOP = ValCtx.DxilMod.GetOP(); + // Allow HitObject type. 
+ if (ST == HlslOP->GetHitObjectType()) + return true; if (IsDxilBuiltinStructType(ST, HlslOP)) { ValCtx.EmitTypeError(Ty, ValidationRule::InstrDxilStructUser); Result = false; @@ -3766,6 +4218,9 @@ static void ValidateResourceOverlap( static void ValidateResource(hlsl::DxilResource &Res, ValidationContext &ValCtx) { + if (Res.IsReorderCoherent() && !ValCtx.DxilMod.GetShaderModel()->IsSM69Plus()) + ValCtx.EmitResourceError(&Res, + ValidationRule::InstrReorderCoherentRequiresSM69); switch (Res.GetKind()) { case DXIL::ResourceKind::RawBuffer: case DXIL::ResourceKind::TypedBuffer: @@ -3997,10 +4452,13 @@ static void ValidateResources(ValidationContext &ValCtx) { ValCtx.EmitResourceError(Uav.get(), ValidationRule::SmCounterOnlyOnStructBuf); } - if (Uav->HasCounter() && Uav->IsGloballyCoherent()) - ValCtx.EmitResourceFormatError(Uav.get(), - ValidationRule::MetaGlcNotOnAppendConsume, - {ValCtx.GetResourceName(Uav.get())}); + const bool UavIsCoherent = + Uav->IsGloballyCoherent() || Uav->IsReorderCoherent(); + if (Uav->HasCounter() && UavIsCoherent) { + StringRef Prefix = Uav->IsGloballyCoherent() ? "globally" : "reorder"; + ValCtx.EmitResourceFormatError( + Uav.get(), ValidationRule::MetaCoherenceNotOnAppendConsume, {Prefix}); + } ValidateResource(*Uav, ValCtx); ValidateResourceOverlap(*Uav, UavAllocator, ValCtx); diff --git a/lib/HLSL/DxilCondenseResources.cpp b/lib/HLSL/DxilCondenseResources.cpp index 529c203bdc..09dd9cea64 100644 --- a/lib/HLSL/DxilCondenseResources.cpp +++ b/lib/HLSL/DxilCondenseResources.cpp @@ -655,7 +655,7 @@ class ResourceUseErrors { public: ResourceUseErrors() : m_bErrorsReported(false) {} - enum ErrorCode { + enum ErrorCode : unsigned int { // Collision between use of one resource GV and another. // All uses must be guaranteed to resolve to only one GV. // Additionally, when writing resource to alloca, all uses diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index be45021e41..3c062475af 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -7,9 +7,6 @@ // // // Lower functions to lower HL operations to DXIL operations. // // // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. // -// All rights reserved. // -// // /////////////////////////////////////////////////////////////////////////////// #include "dxc/DXIL/DxilConstants.h" @@ -19,6 +16,8 @@ #include #include +#include "dxc/DXIL/DxilConstants.h" +#include "dxc/DXIL/DxilInstructions.h" #include "dxc/DXIL/DxilModule.h" #include "dxc/DXIL/DxilOperations.h" #include "dxc/DXIL/DxilResourceProperties.h" @@ -3064,10 +3063,10 @@ static Value *ScalarizeResRet(Type *RetTy, Value *ResRet, } void UpdateStatus(Value *ResRet, Value *status, IRBuilder<> &Builder, - hlsl::OP *hlslOp) { + hlsl::OP *hlslOp, + unsigned StatusIndex = DXIL::kResRetStatusIndex) { if (status && !isa(status)) { - Value *statusVal = - Builder.CreateExtractValue(ResRet, DXIL::kResRetStatusIndex); + Value *statusVal = Builder.CreateExtractValue(ResRet, StatusIndex); Value *checkAccessOp = hlslOp->GetI32Const( static_cast(DXIL::OpCode::CheckAccessFullyMapped)); Function *checkAccessFn = hlslOp->GetOpFunc( @@ -4029,9 +4028,9 @@ struct ResLoadHelper { // Used for some subscript operators that feed the generic HL call inst // into a load op and by the matrixload call instruction. 
ResLoadHelper(Instruction *Inst, DxilResource::Kind RK, Value *h, Value *idx, - Value *Offset, Value *mip = nullptr) + Value *Offset, Value *status = nullptr, Value *mip = nullptr) : intrinsicOpCode(IntrinsicOp::Num_Intrinsics), handle(h), retVal(Inst), - addr(idx), offset(Offset), status(nullptr), mipLevel(mip) { + addr(idx), offset(Offset), status(status), mipLevel(mip) { opcode = LoadOpFromResKind(RK); Type *Ty = Inst->getType(); if (opcode == OP::OpCode::RawBufferLoad && Ty->isVectorTy() && @@ -4305,18 +4304,22 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK, Function *F = OP->GetOpFunc(opcode, EltTy); Value *Ld = Builder.CreateCall(F, Args, OP::GetOpCodeName(opcode)); + unsigned StatusIndex; // Extract elements from returned ResRet. // Native vector loads just have one vector element in the ResRet. // Others have up to four scalars that need to be individually extracted. - if (opcode == OP::OpCode::RawBufferVectorLoad) + if (opcode == OP::OpCode::RawBufferVectorLoad) { Elts[i++] = Builder.CreateExtractValue(Ld, 0); - else + StatusIndex = DXIL::kVecResRetStatusIndex; + } else { for (unsigned j = 0; j < chunkSize; j++, i++) Elts[i] = Builder.CreateExtractValue(Ld, j); + StatusIndex = DXIL::kResRetStatusIndex; + } // Update status. - UpdateStatus(Ld, helper.status, Builder, OP); + UpdateStatus(Ld, helper.status, Builder, OP, StatusIndex); if (!FirstLd) FirstLd = Ld; @@ -5718,58 +5721,51 @@ Value *TranslateCallShader(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return Builder.CreateCall(F, {opArg, ShaderIndex, Parameter}); } -Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, +static void TransferRayDescArgs(Value **Args, hlsl::OP *OP, + IRBuilder<> &Builder, CallInst *CI, + unsigned &Index, unsigned &HLIndex) { + // Extract elements from flattened ray desc arguments in HL op. 
+ // float3 Origin; + Value *origin = CI->getArgOperand(HLIndex++); + Args[Index++] = Builder.CreateExtractElement(origin, (uint64_t)0); + Args[Index++] = Builder.CreateExtractElement(origin, 1); + Args[Index++] = Builder.CreateExtractElement(origin, 2); + // float TMin; + Args[Index++] = CI->getArgOperand(HLIndex++); + // float3 Direction; + Value *direction = CI->getArgOperand(HLIndex++); + Args[Index++] = Builder.CreateExtractElement(direction, (uint64_t)0); + Args[Index++] = Builder.CreateExtractElement(direction, 1); + Args[Index++] = Builder.CreateExtractElement(direction, 2); + // float TMax; + Args[Index++] = CI->getArgOperand(HLIndex++); +} + +Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, + HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - - Value *rayDesc = CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx); - Value *payLoad = CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx); - - Value *opArg = hlslOP->GetU32Const(static_cast(opcode)); + hlsl::OP *OP = &Helper.hlslOP; Value *Args[DXIL::OperandIndex::kTraceRayNumOp]; - Args[0] = opArg; - for (unsigned i = 1; i < HLOperandIndex::kTraceRayRayDescOpIdx; i++) { - Args[i] = CI->getArgOperand(i); - } - IRBuilder<> Builder(CI); - // struct RayDesc - //{ - // float3 Origin; - // float TMin; - // float3 Direction; - // float TMax; - //}; - Value *zeroIdx = hlslOP->GetU32Const(0); - Value *origin = Builder.CreateGEP(rayDesc, {zeroIdx, zeroIdx}); - origin = Builder.CreateLoad(origin); - unsigned index = DXIL::OperandIndex::kTraceRayRayDescOpIdx; - Args[index++] = Builder.CreateExtractElement(origin, (uint64_t)0); - Args[index++] = Builder.CreateExtractElement(origin, 1); - Args[index++] = Builder.CreateExtractElement(origin, 2); + Args[0] = OP->GetU32Const(static_cast(OpCode)); + unsigned Index = 1, HLIndex = 1; + while (HLIndex < HLOperandIndex::kTraceRayRayDescOpIdx) + Args[Index++] = CI->getArgOperand(HLIndex++); - Value *tmin = Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(1)}); - tmin = Builder.CreateLoad(tmin); - Args[index++] = tmin; - - Value *direction = - Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(2)}); - direction = Builder.CreateLoad(direction); + IRBuilder<> Builder(CI); + TransferRayDescArgs(Args, OP, Builder, CI, Index, HLIndex); + DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands() - 1); + DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayPayloadOpIdx); - Args[index++] = Builder.CreateExtractElement(direction, (uint64_t)0); - Args[index++] = Builder.CreateExtractElement(direction, 1); - Args[index++] = Builder.CreateExtractElement(direction, 2); + Value *Payload = CI->getArgOperand(HLIndex++); + Args[Index++] = Payload; - Value *tmax = Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(3)}); - tmax = Builder.CreateLoad(tmax); - Args[index++] = tmax; + DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands()); + DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayNumOp); - Args[DXIL::OperandIndex::kTraceRayPayloadOpIdx] = payLoad; - - Type *Ty = payLoad->getType(); - Function *F = hlslOP->GetOpFunc(opcode, Ty); + Type *Ty = Payload->getType(); + Function *F = OP->GetOpFunc(OpCode, Ty); return Builder.CreateCall(F, Args); } @@ -5812,33 +5808,16 @@ Value *TranslateTraceRayInline(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *Args[DXIL::OperandIndex::kTraceRayInlineNumOp]; Args[0] = opArg; - for (unsigned i = 1; i < HLOperandIndex::kTraceRayInlineRayDescOpIdx; i++) { - 
Args[i] = CI->getArgOperand(i); - } + unsigned Index = 1, HLIndex = 1; + while (HLIndex < HLOperandIndex::kTraceRayInlineRayDescOpIdx) + Args[Index++] = CI->getArgOperand(HLIndex++); IRBuilder<> Builder(CI); - unsigned hlIndex = HLOperandIndex::kTraceRayInlineRayDescOpIdx; - unsigned index = DXIL::OperandIndex::kTraceRayInlineRayDescOpIdx; - - // struct RayDesc - //{ - // float3 Origin; - Value *origin = CI->getArgOperand(hlIndex++); - Args[index++] = Builder.CreateExtractElement(origin, (uint64_t)0); - Args[index++] = Builder.CreateExtractElement(origin, 1); - Args[index++] = Builder.CreateExtractElement(origin, 2); - // float TMin; - Args[index++] = CI->getArgOperand(hlIndex++); - // float3 Direction; - Value *direction = CI->getArgOperand(hlIndex++); - Args[index++] = Builder.CreateExtractElement(direction, (uint64_t)0); - Args[index++] = Builder.CreateExtractElement(direction, 1); - Args[index++] = Builder.CreateExtractElement(direction, 2); - // float TMax; - Args[index++] = CI->getArgOperand(hlIndex++); - //}; - - DXASSERT_NOMSG(index == DXIL::OperandIndex::kTraceRayInlineNumOp); + DXASSERT_NOMSG(HLIndex == HLOperandIndex::kTraceRayInlineRayDescOpIdx); + DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayInlineRayDescOpIdx); + TransferRayDescArgs(Args, hlslOP, Builder, CI, Index, HLIndex); + DXASSERT_NOMSG(HLIndex == CI->getNumArgOperands()); + DXASSERT_NOMSG(Index == DXIL::OperandIndex::kTraceRayInlineNumOp); Function *F = hlslOP->GetOpFunc(opcode, Builder.getVoidTy()); @@ -5953,19 +5932,31 @@ Value *TranslateNoArgVectorOperation(CallInst *CI, IntrinsicOp IOP, return retVal; } +template +static void GetMatrixIndices(Constant *&Rows, Constant *&Cols, bool Is3x4, + LLVMContext &Ctx) { + if (Is3x4) { + uint32_t RVals[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}; + Rows = ConstantDataVector::get(Ctx, RVals); + ColElemTy CVals[] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + Cols = ConstantDataVector::get(Ctx, CVals); + return; + } + uint32_t RVals[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + Rows = ConstantDataVector::get(Ctx, RVals); + ColElemTy CVals[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3}; + Cols = ConstantDataVector::get(Ctx, CVals); +} + Value *TranslateNoArgMatrix3x4Operation( CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { hlsl::OP *hlslOP = &helper.hlslOP; VectorType *Ty = cast(CI->getType()); - uint32_t rVals[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}; - Constant *rows = ConstantDataVector::get(CI->getContext(), rVals); - uint8_t cVals[] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - Constant *cols = ConstantDataVector::get(CI->getContext(), cVals); - Value *retVal = - TrivialDxilOperation(opcode, {nullptr, rows, cols}, Ty, CI, hlslOP); - return retVal; + Constant *Rows, *Cols; + GetMatrixIndices(Rows, Cols, true, CI->getContext()); + return TrivialDxilOperation(opcode, {nullptr, Rows, Cols}, Ty, CI, hlslOP); } Value *TranslateNoArgTransposedMatrix3x4Operation( @@ -5974,13 +5965,9 @@ Value *TranslateNoArgTransposedMatrix3x4Operation( bool &Translated) { hlsl::OP *hlslOP = &helper.hlslOP; VectorType *Ty = cast(CI->getType()); - uint32_t rVals[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; - Constant *rows = ConstantDataVector::get(CI->getContext(), rVals); - uint8_t cVals[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3}; - Constant *cols = ConstantDataVector::get(CI->getContext(), cVals); - Value *retVal = - TrivialDxilOperation(opcode, {nullptr, rows, cols}, Ty, CI, hlslOP); - return retVal; + Constant *Rows, 
*Cols; + GetMatrixIndices(Rows, Cols, false, CI->getContext()); + return TrivialDxilOperation(opcode, {nullptr, Rows, Cols}, Ty, CI, hlslOP); } /* @@ -6184,55 +6171,49 @@ Value *TranslateUnpack(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, // Shader Execution Reordering. namespace { -Value *TranslateHitObjectMake(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, - HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *ObjHelper, - bool &Translated) { +Value *TranslateHitObjectMakeNop(CallInst *CI, IntrinsicOp IOP, + OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { hlsl::OP *HlslOP = &Helper.hlslOP; IRBuilder<> Builder(CI); - unsigned SrcIdx = 1; - Value *HitObjectPtr = CI->getArgOperand(SrcIdx++); - if (Opcode == OP::OpCode::HitObject_MakeNop) { - Value *HitObject = TrivialDxilOperation( - Opcode, {nullptr}, Type::getVoidTy(CI->getContext()), CI, HlslOP); - Builder.CreateStore(HitObject, HitObjectPtr); - DXASSERT( - CI->use_empty(), - "Default ctor return type is a Clang artifact. Value must not be used"); - return nullptr; - } + Value *HitObjectPtr = CI->getArgOperand(1); + Value *HitObject = TrivialDxilOperation( + Opcode, {nullptr}, Type::getVoidTy(CI->getContext()), CI, HlslOP); + Builder.CreateStore(HitObject, HitObjectPtr); + DXASSERT( + CI->use_empty(), + "Default ctor return type is a Clang artifact. Value must not be used"); + return nullptr; +} +Value *TranslateHitObjectMakeMiss(CallInst *CI, IntrinsicOp IOP, + OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { DXASSERT_NOMSG(CI->getNumArgOperands() == HLOperandIndex::kHitObjectMakeMiss_NumOp); - Value *RayFlags = CI->getArgOperand(SrcIdx++); - Value *MissShaderIdx = CI->getArgOperand(SrcIdx++); - DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectMakeMissRayDescOpIdx); - Value *RayDescOrigin = CI->getArgOperand(SrcIdx++); - Value *RayDescOriginX = - Builder.CreateExtractElement(RayDescOrigin, (uint64_t)0); - Value *RayDescOriginY = - Builder.CreateExtractElement(RayDescOrigin, (uint64_t)1); - Value *RayDescOriginZ = - Builder.CreateExtractElement(RayDescOrigin, (uint64_t)2); - - Value *RayDescTMin = CI->getArgOperand(SrcIdx++); - Value *RayDescDirection = CI->getArgOperand(SrcIdx++); - Value *RayDescDirectionX = - Builder.CreateExtractElement(RayDescDirection, (uint64_t)0); - Value *RayDescDirectionY = - Builder.CreateExtractElement(RayDescDirection, (uint64_t)1); - Value *RayDescDirectionZ = - Builder.CreateExtractElement(RayDescDirection, (uint64_t)2); - - Value *RayDescTMax = CI->getArgOperand(SrcIdx++); + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + Value *Args[DXIL::OperandIndex::kHitObjectMakeMiss_NumOp]; + Args[0] = nullptr; // Filled in by TrivialDxilOperation + + unsigned DestIdx = 1, SrcIdx = 1; + Value *HitObjectPtr = CI->getArgOperand(SrcIdx++); + Args[DestIdx++] = CI->getArgOperand(SrcIdx++); // RayFlags + Args[DestIdx++] = CI->getArgOperand(SrcIdx++); // MissShaderIdx + + DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx); + DXASSERT_NOMSG(DestIdx == + DXIL::OperandIndex::kHitObjectMakeMiss_RayDescOpIdx); + TransferRayDescArgs(Args, OP, Builder, CI, DestIdx, SrcIdx); DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands()); + DXASSERT_NOMSG(DestIdx == DXIL::OperandIndex::kHitObjectMakeMiss_NumOp); - Value *OutHitObject = TrivialDxilOperation( - Opcode, - {nullptr, RayFlags, MissShaderIdx, RayDescOriginX, RayDescOriginY, - RayDescOriginZ, 
RayDescTMin, RayDescDirectionX, RayDescDirectionY, - RayDescDirectionZ, RayDescTMax}, - Helper.voidTy, CI, HlslOP); + Value *OutHitObject = + TrivialDxilOperation(Opcode, Args, Helper.voidTy, CI, OP); Builder.CreateStore(OutHitObject, HitObjectPtr); return nullptr; } @@ -6299,7 +6280,32 @@ Value *TranslateHitObjectFromRayQuery(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + unsigned SrcIdx = 1; + Value *HitObjectPtr = CI->getArgOperand(SrcIdx++); + Value *RayQuery = CI->getArgOperand(SrcIdx++); + + if (CI->getNumArgOperands() == + HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp) { + Value *HitKind = CI->getArgOperand(SrcIdx++); + Value *AttribSrc = CI->getArgOperand(SrcIdx++); + DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands()); + OpCode = DXIL::OpCode::HitObject_FromRayQueryWithAttrs; + Type *AttrTy = AttribSrc->getType(); + Value *OutHitObject = TrivialDxilOperation( + OpCode, {nullptr, RayQuery, HitKind, AttribSrc}, AttrTy, CI, OP); + Builder.CreateStore(OutHitObject, HitObjectPtr); + return nullptr; + } + + DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands()); + OpCode = DXIL::OpCode::HitObject_FromRayQuery; + Value *OutHitObject = + TrivialDxilOperation(OpCode, {nullptr, RayQuery}, Helper.voidTy, CI, OP); + Builder.CreateStore(OutHitObject, HitObjectPtr); + return nullptr; } Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP, @@ -6307,7 +6313,42 @@ Value *TranslateHitObjectTraceRay(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + DXASSERT_NOMSG(CI->getNumArgOperands() == + HLOperandIndex::kHitObjectTraceRay_NumOp); + Value *Args[DXIL::OperandIndex::kHitObjectTraceRay_NumOp]; + Value *OpArg = OP->GetU32Const(static_cast(OpCode)); + Args[0] = OpArg; + + unsigned DestIdx = 1, SrcIdx = 1; + Value *HitObjectPtr = CI->getArgOperand(SrcIdx++); + Args[DestIdx++] = CI->getArgOperand(SrcIdx++); + for (; SrcIdx < HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx; + ++SrcIdx, ++DestIdx) { + Args[DestIdx] = CI->getArgOperand(SrcIdx); + } + + DXASSERT_NOMSG(SrcIdx == HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx); + DXASSERT_NOMSG(DestIdx == + DXIL::OperandIndex::kHitObjectTraceRay_RayDescOpIdx); + TransferRayDescArgs(Args, OP, Builder, CI, DestIdx, SrcIdx); + DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands() - 1); + DXASSERT_NOMSG(DestIdx == + DXIL::OperandIndex::kHitObjectTraceRay_PayloadOpIdx); + + Value *Payload = CI->getArgOperand(SrcIdx++); + Args[DestIdx++] = Payload; + + DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands()); + DXASSERT_NOMSG(DestIdx == DXIL::OperandIndex::kHitObjectTraceRay_NumOp); + + Function *F = OP->GetOpFunc(OpCode, Payload->getType()); + + Value *OutHitObject = Builder.CreateCall(F, Args); + Builder.CreateStore(OutHitObject, HitObjectPtr); + return nullptr; } Value *TranslateHitObjectInvoke(CallInst *CI, IntrinsicOp IOP, @@ -6315,7 +6356,16 @@ Value *TranslateHitObjectInvoke(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return nullptr; // TODO: Merge SER DXIL patches + unsigned SrcIdx = 1; + Value *HitObjectPtr = CI->getArgOperand(SrcIdx++); + Value *Payload 
= CI->getArgOperand(SrcIdx++); + DXASSERT_NOMSG(SrcIdx == CI->getNumArgOperands()); + + IRBuilder<> Builder(CI); + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + TrivialDxilOperation(OpCode, {nullptr, HitObject, Payload}, + Payload->getType(), CI, &Helper.hlslOP); + return nullptr; } Value *TranslateHitObjectGetAttributes(CallInst *CI, IntrinsicOp IOP, @@ -6323,7 +6373,16 @@ Value *TranslateHitObjectGetAttributes(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Value *HitObjectPtr = CI->getArgOperand(1); + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + Value *AttrOutPtr = + CI->getArgOperand(HLOperandIndex::kHitObjectGetAttributes_AttributeOpIdx); + TrivialDxilOperation(OpCode, {nullptr, HitObject, AttrOutPtr}, + AttrOutPtr->getType(), CI, OP); + return nullptr; } Value *TranslateHitObjectScalarGetter(CallInst *CI, IntrinsicOp IOP, @@ -6331,7 +6390,12 @@ Value *TranslateHitObjectScalarGetter(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + Value *HitObjectPtr = CI->getArgOperand(1); + IRBuilder<> Builder(CI); + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + return TrivialDxilOperation(OpCode, {nullptr, HitObject}, CI->getType(), CI, + OP); } Value *TranslateHitObjectVectorGetter(CallInst *CI, IntrinsicOp IOP, @@ -6339,7 +6403,24 @@ Value *TranslateHitObjectVectorGetter(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + Value *HitObjectPtr = CI->getArgOperand(1); + IRBuilder<> Builder(CI); + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + VectorType *Ty = cast(CI->getType()); + uint32_t Vals[] = {0, 1, 2, 3}; + Constant *Src = ConstantDataVector::get(CI->getContext(), Vals); + return TrivialDxilOperation(OpCode, {nullptr, HitObject, Src}, Ty, CI, OP); +} + +static bool IsHitObject3x4Getter(IntrinsicOp IOP) { + switch (IOP) { + default: + return false; + case IntrinsicOp::MOP_DxHitObject_GetObjectToWorld3x4: + case IntrinsicOp::MOP_DxHitObject_GetWorldToObject3x4: + return true; + } } Value *TranslateHitObjectMatrixGetter(CallInst *CI, IntrinsicOp IOP, @@ -6347,21 +6428,51 @@ Value *TranslateHitObjectMatrixGetter(CallInst *CI, IntrinsicOp IOP, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + Value *HitObjectPtr = CI->getArgOperand(1); + IRBuilder<> Builder(CI); + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + + // Create 3x4 matrix indices + bool Is3x4 = IsHitObject3x4Getter(IOP); + Constant *Rows, *Cols; + GetMatrixIndices(Rows, Cols, Is3x4, CI->getContext()); + + VectorType *Ty = cast(CI->getType()); + return TrivialDxilOperation(OpCode, {nullptr, HitObject, Rows, Cols}, Ty, CI, + OP); } Value *TranslateHitObjectLoadLocalRootTableConstant( CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: 
Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Value *HitObjectPtr = CI->getArgOperand(1); + Value *Offset = CI->getArgOperand(2); + + Value *HitObject = Builder.CreateLoad(HitObjectPtr); + return TrivialDxilOperation(OpCode, {nullptr, HitObject, Offset}, + Helper.voidTy, CI, OP); } Value *TranslateHitObjectSetShaderTableIndex( CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { - return UndefValue::get(CI->getType()); // TODO: Merge SER DXIL patches + hlsl::OP *OP = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Value *HitObjectPtr = CI->getArgOperand(1); + Value *ShaderTableIndex = CI->getArgOperand(2); + + Value *InHitObject = Builder.CreateLoad(HitObjectPtr); + Value *OutHitObject = TrivialDxilOperation( + OpCode, {nullptr, InHitObject, ShaderTableIndex}, Helper.voidTy, CI, OP); + Builder.CreateStore(OutHitObject, HitObjectPtr); + return nullptr; } } // namespace @@ -6417,6 +6528,200 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return Builder.CreateSelect(cond, t, f); } + +Value *TranslateMatVecMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input parameters + Value *InputVector = + CI->getArgOperand(HLOperandIndex::kMatVecMulInputVectorIdx); + Value *InputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulIsInputUnsignedIdx); + Value *InputInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulInputInterpretationIdx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixBufferIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixInterpretationIdx); + Value *MatrixM = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixMIdx); + Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixKIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixLayoutIdx); + Value *MatrixTranspose = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixTransposeIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixStrideIdx); + + // Output parameters + Value *OutputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulIsOutputUnsignedIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx) + ->getType() + ->getPointerElementType(), + InputVector->getType()}); + + // Create a call to the DXIL function + Value *NewCI = Builder.CreateCall( + DxilFunc, + {OpArg, InputVector, InputIsUnsigned, InputInterpretation, MatrixBuffer, + MatrixOffset, MatrixInterpretation, MatrixM, MatrixK, MatrixLayout, + MatrixTranspose, MatrixStride, OutputIsUnsigned}); + + // Get the output parameter and store the result + Value *OutParam = + CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx); + + Builder.CreateStore(NewCI, OutParam); + + return nullptr; +} + +Value *TranslateMatVecMulAdd(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = 
&Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameters + Value *InputVector = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputVectorIdx); + Value *InputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsInputUnsignedIdx); + Value *InputInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputInterpretationIdx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixBufferIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixInterpretationIdx); + Value *MatrixM = CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixMIdx); + Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixKIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixLayoutIdx); + Value *MatrixTranspose = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixTransposeIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixStrideIdx); + + // Bias parameters + Value *BiasBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasBufferIdx); + Value *BiasOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasOffsetIdx); + Value *BiasInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasInterpretationIdx); + + // Output parameters + Value *OutputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsOutputUnsignedIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx) + ->getType() + ->getPointerElementType(), + InputVector->getType()}); + + // Create a call to the DXIL function + Value *NewCI = Builder.CreateCall( + DxilFunc, {OpArg, InputVector, InputIsUnsigned, InputInterpretation, + MatrixBuffer, MatrixOffset, MatrixInterpretation, MatrixM, + MatrixK, MatrixLayout, MatrixTranspose, MatrixStride, + BiasBuffer, BiasOffset, BiasInterpretation, OutputIsUnsigned}); + + // Store the result in the output parameter + Value *OutParam = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx); + Builder.CreateStore(NewCI, OutParam); + + return nullptr; +} + +Value *TranslateOuterProductAccumulate(CallInst *CI, IntrinsicOp IOP, + OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameters + Value *InputVector1 = + CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec1Idx); + Value *InputVector2 = + CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec2Idx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixInterpretationIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixLayoutIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixStrideIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {InputVector1->getType(), InputVector2->getType()}); + + return 
Builder.CreateCall( + DxilFunc, {OpArg, InputVector1, InputVector2, MatrixBuffer, MatrixOffset, + MatrixInterpretation, MatrixLayout, MatrixStride}); +} + +Value *TranslateVectorAccumulate(CallInst *CI, IntrinsicOp IOP, + OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameter + Value *InputVector = CI->getArgOperand(HLOperandIndex::kVectorAccInputVecIdx); + + // Matrix parameters + Value *MatrixBuffer = CI->getArgOperand(HLOperandIndex::kVectorAccMatrixIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kVectorAccMatrixOffsetIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc(OpCode, InputVector->getType()); + + return Builder.CreateCall(DxilFunc, + {OpArg, InputVector, MatrixBuffer, MatrixOffset}); +} + } // namespace // Lower table. @@ -7063,7 +7368,7 @@ IntrinsicLower gLowerTable[] = { DXIL::OpCode::NumOpCodes}, {IntrinsicOp::MOP_InterlockedUMin, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes}, - {IntrinsicOp::MOP_DxHitObject_MakeNop, TranslateHitObjectMake, + {IntrinsicOp::MOP_DxHitObject_MakeNop, TranslateHitObjectMakeNop, DXIL::OpCode::HitObject_MakeNop}, {IntrinsicOp::IOP_DxMaybeReorderThread, TranslateMaybeReorderThread, DXIL::OpCode::MaybeReorderThread}, @@ -7123,13 +7428,22 @@ IntrinsicLower gLowerTable[] = { {IntrinsicOp::MOP_DxHitObject_LoadLocalRootTableConstant, TranslateHitObjectLoadLocalRootTableConstant, DXIL::OpCode::HitObject_LoadLocalRootTableConstant}, - {IntrinsicOp::MOP_DxHitObject_MakeMiss, TranslateHitObjectMake, + {IntrinsicOp::MOP_DxHitObject_MakeMiss, TranslateHitObjectMakeMiss, DXIL::OpCode::HitObject_MakeMiss}, {IntrinsicOp::MOP_DxHitObject_SetShaderTableIndex, TranslateHitObjectSetShaderTableIndex, DXIL::OpCode::HitObject_SetShaderTableIndex}, {IntrinsicOp::MOP_DxHitObject_TraceRay, TranslateHitObjectTraceRay, DXIL::OpCode::HitObject_TraceRay}, + + {IntrinsicOp::IOP___builtin_MatVecMul, TranslateMatVecMul, + DXIL::OpCode::MatVecMul}, + {IntrinsicOp::IOP___builtin_MatVecMulAdd, TranslateMatVecMulAdd, + DXIL::OpCode::MatVecMulAdd}, + {IntrinsicOp::IOP___builtin_OuterProductAccumulate, + TranslateOuterProductAccumulate, DXIL::OpCode::OuterProductAccumulate}, + {IntrinsicOp::IOP___builtin_VectorAccumulate, TranslateVectorAccumulate, + DXIL::OpCode::VectorAccumulate}, }; } // namespace static_assert( @@ -8220,7 +8534,7 @@ Value *TranslateStructBufMatLd(CallInst *CI, IRBuilder<> &Builder, Value *status, Value *bufIdx, Value *baseOffset, const DataLayout &DL) { - ResLoadHelper helper(CI, RK, handle, bufIdx, baseOffset); + ResLoadHelper helper(CI, RK, handle, bufIdx, baseOffset, status); #ifndef NDEBUG Value *ptr = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx); Type *matType = ptr->getType()->getPointerElementType(); @@ -8547,7 +8861,7 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle, } } else if (LoadInst *LdInst = dyn_cast(user)) { // Load of scalar/vector within a struct or structured raw load. 
- ResLoadHelper helper(LdInst, ResKind, handle, bufIdx, baseOffset); + ResLoadHelper helper(LdInst, ResKind, handle, bufIdx, baseOffset, status); TranslateBufLoad(helper, ResKind, Builder, OP, DL); LdInst->eraseFromParent(); @@ -8922,7 +9236,8 @@ void TranslateHLSubscript(CallInst *CI, HLSubscriptOpcode opcode, IRBuilder<> Builder(CI); if (LoadInst *ldInst = dyn_cast(*U)) { Value *Offset = UndefValue::get(Builder.getInt32Ty()); - ResLoadHelper ldHelper(ldInst, RK, handle, coord, Offset, mipLevel); + ResLoadHelper ldHelper(ldInst, RK, handle, coord, Offset, + /*status*/ nullptr, mipLevel); TranslateBufLoad(ldHelper, RK, Builder, hlslOP, helper.dataLayout); ldInst->eraseFromParent(); } else { diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 3c76c72271..40c22459e2 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -331,7 +331,7 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end, /* If we ran off the end it is exactly zero or one-half, otherwise a little more. */ - if (hexDigit == -1U) + if (hexDigit == ~0U) return digitValue == 0 ? lfExactlyZero: lfExactlyHalf; else return digitValue == 0 ? lfLessThanHalf: lfMoreThanHalf; @@ -446,7 +446,7 @@ ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest) if (~parts[count]) return ~(integerPart) 0; /* A lot. */ - return -parts[0]; + return (~parts[0] + 1); } return ~(integerPart) 0; /* A lot. */ @@ -2368,7 +2368,7 @@ APFloat::convertFromHexadecimalString(StringRef s, roundingMode rounding_mode) } hex_value = hexDigitValue(*p); - if (hex_value == -1U) + if (hex_value == ~0U) break; p++; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 23f89bb66f..d01238a552 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -70,7 +70,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) { if (r < radix) return r; - return -1U; + return std::numeric_limits::max(); } @@ -79,7 +79,7 @@ void APInt::initSlowCase(unsigned numBits, uint64_t val, bool isSigned) { pVal[0] = val; if (isSigned && int64_t(val) < 0) for (unsigned i = 1; i < getNumWords(); ++i) - pVal[i] = -1ULL; + pVal[i] = std::numeric_limits::max(); } void APInt::initSlowCase(const APInt& that) { @@ -735,7 +735,7 @@ unsigned APInt::countLeadingOnes() const { unsigned Count = llvm::countLeadingOnes(pVal[i] << shift); if (Count == highWordBits) { for (i--; i >= 0; --i) { - if (pVal[i] == -1ULL) + if (pVal[i] == std::numeric_limits::max()) Count += APINT_BITS_PER_WORD; else { Count += llvm::countLeadingOnes(pVal[i]); @@ -761,7 +761,8 @@ unsigned APInt::countTrailingZeros() const { unsigned APInt::countTrailingOnesSlowCase() const { unsigned Count = 0; unsigned i = 0; - for (; i < getNumWords() && pVal[i] == -1ULL; ++i) + for (; i < getNumWords() && pVal[i] == std::numeric_limits::max(); + ++i) Count += APINT_BITS_PER_WORD; if (i < getNumWords()) Count += llvm::countTrailingOnes(pVal[i]); @@ -1070,7 +1071,7 @@ APInt APInt::ashr(unsigned shiftAmt) const { // issues in the algorithm below. if (shiftAmt == BitWidth) { if (isNegative()) - return APInt(BitWidth, -1ULL, true); + return APInt(BitWidth, std::numeric_limits::max(), true); else return APInt(BitWidth, 0); } @@ -1123,7 +1124,8 @@ APInt APInt::ashr(unsigned shiftAmt) const { } // Remaining words are 0 or -1, just assign them. - uint64_t fillValue = (isNegative() ? -1ULL : 0); + uint64_t fillValue = + (isNegative() ? 
std::numeric_limits::max() : 0); for (unsigned i = breakWord+1; i < getNumWords(); ++i) val[i] = fillValue; APInt Result(val, BitWidth); @@ -2192,7 +2194,18 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, N = I; } else { Str.push_back('-'); - N = -(uint64_t)I; + // In this else block, all values of I must be less than 0. + // + // Because values are stored in 2's complement and I is a signed + // integer, the expression -I is equivalent to (~I + 1) for all values + // of I, except INT64_MIN, where -I is undefined behavior in C++ due to + // overflow. + // + // However, (~I + 1) is still well-defined even when I == INT64_MIN, and + // it evaluates to the same bit pattern as INT64_MIN. Because N is + // unsigned, assigning N = ~I + 1 preserves the exact bit pattern + // and correctly represents the 2's complement value of -I. + N = (~I + 1); } } @@ -2408,7 +2421,7 @@ APInt::tcLSB(const integerPart *parts, unsigned int n) } } - return -1U; + return std::numeric_limits::max(); } /* Returns the bit number of the most significant set bit of a number. @@ -2428,7 +2441,7 @@ APInt::tcMSB(const integerPart *parts, unsigned int n) } } while (n); - return -1U; + return std::numeric_limits::max(); } /* Copy the bit vector of width srcBITS from SRC, starting at bit diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp index 5d6d60a87f..625fb3595a 100644 --- a/lib/Support/DataExtractor.cpp +++ b/lib/Support/DataExtractor.cpp @@ -168,7 +168,7 @@ int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const { // Sign bit of byte is 2nd high order bit (0x40) if (shift < 64 && (byte & 0x40)) - result |= -(1ULL << shift); + result |= (~(1ULL << shift) + 1); *offset_ptr = offset; return result; diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index ddece087a9..52b949d826 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/edit_distance.h" #include +#include using namespace llvm; @@ -393,13 +394,16 @@ bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, // Get the positive part of the value. if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || - // Reject values so large they'd overflow as negative signed, but allow - // "-0". This negates the unsigned so that the negative isn't undefined - // on signed overflow. - (long long)-ULLVal > 0) + // Reject values larger than what can be represented as negative signed. + // The most negative long long is LLONG_MIN, which has magnitude + // (LLONG_MAX + 1). Values larger than this magnitude cannot be negated + // without overflow. 
+ ULLVal > static_cast( + std::numeric_limits::max()) + + 1) return true; - Result = -ULLVal; + Result = (~ULLVal + 1); return false; } diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp index 136b93ecee..06de27bbda 100644 --- a/lib/Support/TimeValue.cpp +++ b/lib/Support/TimeValue.cpp @@ -19,8 +19,7 @@ using namespace sys; const TimeValue::SecondsType TimeValue::PosixZeroTimeSeconds = -946684800; -const TimeValue::SecondsType - TimeValue::Win32ZeroTimeSeconds = -12591158400ULL; +const TimeValue::SecondsType TimeValue::Win32ZeroTimeSeconds = -12591158400LL; void TimeValue::normalize( void ) { diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index b11ffb15d5..595468a6dc 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -134,13 +134,18 @@ raw_ostream &raw_ostream::operator<<(unsigned long N) { } raw_ostream &raw_ostream::operator<<(long N) { + // A positive signed long has the same value when casted to its unsigned + // counterpart. If its negative, then we'll handle it in the below if block. + unsigned long UN = static_cast(N); + if (N < 0 && writeBase == 10) { *this << '-'; - // Avoid undefined behavior on LONG_MIN with a cast. - N = -(unsigned long)N; + // Since N is negative and we're storing the result in an unsigned Long, + // we can use the equivalence of -N == ~N + 1 to get the positive value. + UN = ~N + 1UL; } - return this->operator<<(static_cast(N)); + return this->operator<<(UN); } raw_ostream &raw_ostream::operator<<(unsigned long long N) { @@ -169,13 +174,18 @@ raw_ostream &raw_ostream::operator<<(unsigned long long N) { } raw_ostream &raw_ostream::operator<<(long long N) { + // A positive signed long has the same value when casted to its unsigned + // counterpart. If its negative, then we'll handle it in the below if block. + unsigned long long UN = static_cast(N); + if (N < 0 && writeBase == 10) { *this << '-'; - // Avoid undefined behavior on INT64_MIN with a cast. - N = -(unsigned long long)N; + // Since N is negative and we're storing the result in an unsigned Long, + // we can use the equivalence of -N == ~N + 1 to get the positive value. + UN = ~N + 1ULL; } - return this->operator<<(static_cast(N)); + return this->operator<<(UN); } // HLSL Change Starts - Generalize non-base10 printing. @@ -470,7 +480,10 @@ raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) { char *EndPtr = NumberBuffer+sizeof(NumberBuffer); char *CurPtr = EndPtr; bool Neg = (FN.DecValue < 0); - uint64_t N = Neg ? -static_cast(FN.DecValue) : FN.DecValue; + // If the value is negative, and because we are storing the result of the ~ + // operation in an unsigned value, we can use the equivalence of + // -N == ~N + 1 to get the positive value of the negative number + uint64_t N = Neg ? 
(~FN.DecValue + 1UL) : FN.DecValue; while (N) { *--CurPtr = '0' + char(N % 10); N /= 10; diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index d044764025..0cf9f7797a 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -146,7 +146,7 @@ namespace { private: Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses); Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses, - unsigned RetValNum = -1U); + unsigned RetValNum = ~0U); Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); void SurveyFunction(const Function &F); @@ -442,7 +442,7 @@ DAE::Liveness DAE::SurveyUse(const Use *U, // that U is really a use of an insertvalue instruction that uses the // original Use. const Function *F = RI->getParent()->getParent(); - if (RetValNum != -1U) { + if (RetValNum != ~0U) { RetOrArg Use = CreateRet(F, RetValNum); // We might be live, depending on the liveness of Use. return MarkIfNotLive(Use, MaybeLiveUses); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 2d28b14213..66e01198bd 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -998,7 +998,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, for (unsigned i = 0; i < VWidth; i++) { if (DemandedElts[i]) { unsigned MaskVal = Shuffle->getMaskValue(i); - if (MaskVal != -1u) { + if (MaskVal != ~0u) { assert(MaskVal < LHSVWidth * 2 && "shufflevector mask index out of range!"); if (MaskVal < LHSVWidth) @@ -1022,7 +1022,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, bool NewUndefElts = false; for (unsigned i = 0; i < VWidth; i++) { unsigned MaskVal = Shuffle->getMaskValue(i); - if (MaskVal == -1u) { + if (MaskVal == ~0u) { UndefElts.setBit(i); } else if (!DemandedElts[i]) { NewUndefElts = true; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 6bc322fa92..c93232b67f 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1937,7 +1937,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { } else if (IntrinsicInst *II = dyn_cast(I)) { if (II->getIntrinsicID() == Intrinsic::objectsize) { ConstantInt *CI = cast(II->getArgOperand(1)); - uint64_t DontKnow = CI->isZero() ? -1ULL : 0; + uint64_t DontKnow = + CI->isZero() ? std::numeric_limits::max() : 0; ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow)); } } diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp index 6d358744ef..fb48513c18 100644 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -131,10 +131,10 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl &Loads) { LoadInst *BaseLoad = nullptr; SmallVector AggregateLoads; bool Combined = false; - uint64_t PrevOffset = -1ull; + uint64_t PrevOffset = std::numeric_limits::max(); uint64_t PrevSize = 0; for (auto &L : Loads) { - if (PrevOffset == -1ull) { + if (PrevOffset == std::numeric_limits::max()) { BaseLoad = L.Load; PrevOffset = L.POP.Offset; PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( @@ -186,7 +186,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl &Loads) { // Find first load. 
This is where we put the new load. LoadPOPPair FirstLP; - FirstLP.InsertOrder = -1u; + FirstLP.InsertOrder = std::numeric_limits::max(); for (const auto &L : Loads) if (L.InsertOrder < FirstLP.InsertOrder) FirstLP = L; diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3ab9367a6b..60962ec69a 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1395,8 +1395,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset // Offs is the ICmp immediate. if (Scale == 0) - // The cast does the right thing with INT64_MIN. - BaseOffset = -(uint64_t)BaseOffset; + // Negate BaseOffset using two's complement (~x + 1) to avoid undefined + // behavior. Simple negation (-BaseOffset) would be undefined for + // INT64_MIN since -INT64_MIN cannot fit in int64_t. Two's complement + // gives the expected wraparound behavior: -INT64_MIN becomes INT64_MIN. + BaseOffset = ~BaseOffset + 1ULL; return TTI.isLegalICmpImmediate(BaseOffset); } @@ -3000,7 +3003,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // of -1) are now also interesting. for (size_t i = 0, e = Factors.size(); i != e; ++i) if (Factors[i] != -1) - Factors.insert(-(uint64_t)Factors[i]); + Factors.insert(~Factors[i] + 1ULL); Factors.insert(-1); } @@ -3739,7 +3742,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { const SCEV *OrigReg = WI.OrigReg; Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); - const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, ~Imm + 1ULL)); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -3754,8 +3757,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; // Don't create 50 + reg(-50). - if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset)))) + if (F.referencesReg( + SE.getSCEV(ConstantInt::get(IntTy, ~Offset + 1ULL)))) continue; Formula NewF = F; NewF.BaseOffset = Offset; @@ -4556,7 +4559,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. 
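As an aside on the two's-complement idiom used in the negation changes above: applied to an unsigned value, ~x + 1 is always well defined and yields the magnitude of the original signed value, including for INT64_MIN, where plain signed negation would overflow. A minimal standalone sketch, not part of this patch (magnitudeOf is a hypothetical helper name):

#include <cstdint>
#include <cstdio>
#include <limits>

// Returns the magnitude of i as an unsigned value without ever negating a
// signed integer, so the INT64_MIN case stays well defined.
static uint64_t magnitudeOf(int64_t i) {
  uint64_t u = static_cast<uint64_t>(i); // reinterpret the bits as unsigned
  return i < 0 ? ~u + 1 : u;             // two's-complement negation
}

int main() {
  std::printf("%llu\n", (unsigned long long)magnitudeOf(-42)); // 42
  std::printf("%llu\n", (unsigned long long)magnitudeOf(42));  // 42
  // INT64_MIN's magnitude does not fit in int64_t, but the unsigned
  // arithmetic above still produces it: 9223372036854775808.
  std::printf("%llu\n", (unsigned long long)magnitudeOf(
                            std::numeric_limits<int64_t>::min()));
  return 0;
}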
if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); + ICmpScaledV = ConstantInt::get(IntTy, ~Offset + 1ULL); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset); @@ -4608,8 +4611,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, assert((F.Scale == 0 || F.Scale == 1) && "ICmp does not support folding a global value and " "a scale at the same time!"); - Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), - -(uint64_t)Offset); + Constant *C = + ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), ~Offset + 1ULL); if (C->getType() != OpTy) C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, OpTy, false), diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp index e487079b94..54250ad36d 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp @@ -129,7 +129,6 @@ class SROA_Helper { void RewriteMemIntrin(MemIntrinsic *MI, Value *OldV); void RewriteCall(CallInst *CI); void RewriteBitCast(BitCastInst *BCI); - void RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn, bool bOut); }; } // namespace @@ -1478,6 +1477,57 @@ void isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, } } +// Returns whether the `OpIdx` argument of HL intrinsic call `CI` is expected to +// be a user-defined-type. +static bool isUDTIntrinsicArg(CallInst *CI, unsigned OpIdx) { + if (HLOpcodeGroup::HLIntrinsic != GetHLOpcodeGroup(CI->getCalledFunction())) + return false; + const unsigned NumOps = CI->getNumArgOperands(); + switch (static_cast(GetHLOpcode(CI))) { + case IntrinsicOp::IOP_TraceRay: + if (NumOps == HLOperandIndex::kTraceRay_PreNumOp && + OpIdx == HLOperandIndex::kTraceRayPayloadPreOpIdx) + return true; + else if (NumOps == HLOperandIndex::kTraceRay_NumOp && + OpIdx == HLOperandIndex::kTraceRayPayloadOpIdx) + return true; + break; + case IntrinsicOp::IOP_ReportHit: + if (OpIdx == HLOperandIndex::kReportIntersectionAttributeOpIdx) + return true; + break; + case IntrinsicOp::IOP_CallShader: + if (OpIdx == HLOperandIndex::kCallShaderPayloadOpIdx) + return true; + break; + case IntrinsicOp::MOP_DxHitObject_FromRayQuery: + if (NumOps == HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp && + OpIdx == + HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx) + return true; + break; + case IntrinsicOp::MOP_DxHitObject_TraceRay: + if (NumOps == HLOperandIndex::kHitObjectTraceRay_PreNumOp && + OpIdx == HLOperandIndex::kHitObjectTraceRay_PayloadPreOpIdx) + return true; + else if (NumOps == HLOperandIndex::kHitObjectTraceRay_NumOp && + OpIdx == HLOperandIndex::kHitObjectTraceRay_PayloadOpIdx) + return true; + break; + case IntrinsicOp::MOP_DxHitObject_Invoke: + if (OpIdx == HLOperandIndex::kHitObjectInvoke_PayloadOpIdx) + return true; + break; + case IntrinsicOp::MOP_DxHitObject_GetAttributes: + if (OpIdx == HLOperandIndex::kHitObjectGetAttributes_AttributeOpIdx) + return true; + break; + default: + break; + } + return false; +} + /// isSafeForScalarRepl - Check if instruction I is a safe use with regard to /// performing scalar replacement of alloca AI. The results are flagged in /// the Info parameter. Offset indicates the position within AI that is @@ -1535,16 +1585,9 @@ void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { // Most HL functions are safe for scalar repl. 
if (HLOpcodeGroup::NotHL == group) return MarkUnsafe(Info, User); - else if (HLOpcodeGroup::HLIntrinsic == group) { - // TODO: should we check HL parameter type for UDT overload instead of - // basing on IOP? - IntrinsicOp opcode = static_cast(GetHLOpcode(CI)); - if (IntrinsicOp::IOP_TraceRay == opcode || - IntrinsicOp::IOP_ReportHit == opcode || - IntrinsicOp::IOP_CallShader == opcode) { - return MarkUnsafe(Info, User); - } - } + else if (HLOpcodeGroup::HLIntrinsic == group && + isUDTIntrinsicArg(CI, U.getOperandNo())) + return MarkUnsafe(Info, User); } else { return MarkUnsafe(Info, User); } @@ -2660,12 +2703,11 @@ void SROA_Helper::RewriteBitCast(BitCastInst *BCI) { RewriteForGEP(cast(GEP), GEPBuilder); } -/// RewriteCallArg - For Functions which don't flat, -/// replace OldVal with alloca and -/// copy in copy out data between alloca and flattened NewElts -/// in CallInst. -void SROA_Helper::RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn, - bool bOut) { +/// memcpyAggCallArg - For an aggregate call argument, this replaces the +/// argument with an alloca and inserts a memcpy for input (if CopyIn) and +/// output (if CopyOut). +static void memcpyAggCallArg(CallInst *CI, unsigned ArgIdx, bool CopyIn, + bool CopyOut) { Function *F = CI->getParent()->getParent(); IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F)); const DataLayout &DL = F->getParent()->getDataLayout(); @@ -2675,17 +2717,79 @@ void SROA_Helper::RewriteCallArg(CallInst *CI, unsigned ArgIdx, bool bIn, Type *userTyElt = userTy->getElementType(); Value *Alloca = AllocaBuilder.CreateAlloca(userTyElt); IRBuilder<> Builder(CI); - if (bIn) { - MemCpyInst *cpy = cast(Builder.CreateMemCpy( - Alloca, userTyV, DL.getTypeAllocSize(userTyElt), false)); - RewriteMemIntrin(cpy, cpy->getRawSource()); - } + if (CopyIn) + Builder.CreateMemCpy(Alloca, userTyV, DL.getTypeAllocSize(userTyElt), + false); CI->setArgOperand(ArgIdx, Alloca); - if (bOut) { + if (CopyOut) { Builder.SetInsertPoint(CI->getNextNode()); - MemCpyInst *cpy = cast(Builder.CreateMemCpy( - userTyV, Alloca, DL.getTypeAllocSize(userTyElt), false)); - RewriteMemIntrin(cpy, cpy->getRawSource()); + Builder.CreateMemCpy(userTyV, Alloca, DL.getTypeAllocSize(userTyElt), + false); + } +} + +static void copyIntrinsicAggArgs(HLModule &HLM) { + // Iterate HLIntrinsic function users + // For specific intrinsics, use memcpyAggCallArg on aggregate args + // This ensures that the call does not directly use the pointer supplied, + // allowing certain arguments to be flattened, and UDT args to be correctly + // lowered. 
+ for (Function &F : HLM.GetModule()->functions()) { + if (F.isIntrinsic() || !F.isDeclaration()) + continue; + if (GetHLOpcodeGroup(&F) != HLOpcodeGroup::HLIntrinsic) + continue; + // Iterate users + for (User *U : F.users()) { + if (CallInst *CI = dyn_cast(U)) { + switch (static_cast(GetHLOpcode(CI))) { + case IntrinsicOp::IOP_TraceRay: + memcpyAggCallArg(CI, HLOperandIndex::kTraceRayRayDescOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + memcpyAggCallArg(CI, HLOperandIndex::kTraceRayPayloadPreOpIdx, + /*CopyIn*/ true, /*CopyOut*/ true); + break; + case IntrinsicOp::IOP_ReportHit: + memcpyAggCallArg(CI, + HLOperandIndex::kReportIntersectionAttributeOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + break; + case IntrinsicOp::IOP_CallShader: + memcpyAggCallArg(CI, HLOperandIndex::kCallShaderPayloadOpIdx, + /*CopyIn*/ true, /*CopyOut*/ true); + break; + case IntrinsicOp::MOP_TraceRayInline: + memcpyAggCallArg(CI, HLOperandIndex::kTraceRayInlineRayDescOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + break; + case IntrinsicOp::MOP_DxHitObject_FromRayQuery: + if (CI->getNumArgOperands() == + HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_NumOp) + memcpyAggCallArg( + CI, + HLOperandIndex::kHitObjectFromRayQuery_WithAttrs_AttributeOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + break; + case IntrinsicOp::MOP_DxHitObject_MakeMiss: + memcpyAggCallArg(CI, HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + break; + case IntrinsicOp::MOP_DxHitObject_TraceRay: + memcpyAggCallArg(CI, HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx, + /*CopyIn*/ true, /*CopyOut*/ false); + memcpyAggCallArg(CI, + HLOperandIndex::kHitObjectTraceRay_PayloadPreOpIdx, + /*CopyIn*/ true, /*CopyOut*/ true); + break; + case IntrinsicOp::MOP_DxHitObject_Invoke: + memcpyAggCallArg(CI, HLOperandIndex::kHitObjectInvoke_PayloadOpIdx, + /*CopyIn*/ true, /*CopyOut*/ true); + break; + default: + break; + } + } + } } } @@ -2739,13 +2843,26 @@ static CallInst *RewriteWithFlattenedHLIntrinsicCall(CallInst *CI, /// RewriteCall - Replace OldVal with flattened NewElts in CallInst. void SROA_Helper::RewriteCall(CallInst *CI) { - HLOpcodeGroup group = GetHLOpcodeGroupByName(CI->getCalledFunction()); - if (group != HLOpcodeGroup::NotHL) { + HLOpcodeGroup Group = GetHLOpcodeGroupByName(CI->getCalledFunction()); + if (Group != HLOpcodeGroup::NotHL) { unsigned opcode = GetHLOpcode(CI); - if (group == HLOpcodeGroup::HLIntrinsic) { + if (Group == HLOpcodeGroup::HLIntrinsic) { + // RayQuery this pointer replacement. + if (OldVal->getType()->isPointerTy() && + dxilutil::IsHLSLRayQueryType( + OldVal->getType()->getPointerElementType())) { + // For RayQuery methods, we want to replace the RayQuery this pointer + // with a load and use of the underlying handle value. + // This will allow elimination of RayQuery types earlier. + RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, + /*loadElts*/ true); + DeadInsts.push_back(CI); + return; + } + IntrinsicOp IOP = static_cast(opcode); switch (IOP) { - case IntrinsicOp::MOP_Append: { + case IntrinsicOp::MOP_Append: // Buffer Append already expand in code gen. // Must be OutputStream Append here. // Every Elt has a pointer type. 
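As an illustrative aside on the copy-in/copy-out expansion above: the effect of memcpyAggCallArg is that an intrinsic call never touches the caller's aggregate directly, which is what later allows the original variable to be flattened. A minimal standalone C++ sketch of the same semantics for an in/out payload argument (Payload, fakeIntrinsic, and callWithCopyInOut are hypothetical names used only for illustration):

#include <cstdio>
#include <cstring>

struct Payload { float color[4]; int hit; };

// Stand-in for an HLSL intrinsic that reads and writes its aggregate
// argument through a pointer.
static void fakeIntrinsic(Payload *p) { p->hit = 1; }

// Copy-in/copy-out wrapper: the callee only ever sees a temporary
// (the "alloca"), and the caller's storage is updated afterwards.
static void callWithCopyInOut(Payload *userPtr) {
  Payload tmp;
  std::memcpy(&tmp, userPtr, sizeof(Payload)); // copy-in
  fakeIntrinsic(&tmp);                         // call uses the temporary
  std::memcpy(userPtr, &tmp, sizeof(Payload)); // copy-out
}

int main() {
  Payload p = {};
  callWithCopyInOut(&p);
  std::printf("hit = %d\n", p.hit); // prints: hit = 1
  return 0;
}

For arguments that are input-only (for example the RayDesc operands handled above), only the copy-in memcpy is emitted.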
@@ -2753,63 +2870,47 @@ void SROA_Helper::RewriteCall(CallInst *CI) { RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, /*loadElts*/ false); DeadInsts.push_back(CI); - } break; - case IntrinsicOp::IOP_TraceRay: { + return; + case IntrinsicOp::IOP_TraceRay: if (OldVal == CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx)) { - RewriteCallArg(CI, HLOperandIndex::kTraceRayRayDescOpIdx, - /*bIn*/ true, /*bOut*/ false); - } else { - DXASSERT(OldVal == - CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx), - "else invalid TraceRay"); - RewriteCallArg(CI, HLOperandIndex::kTraceRayPayLoadOpIdx, - /*bIn*/ true, /*bOut*/ true); + RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, + /*loadElts*/ true); + DeadInsts.push_back(CI); + return; } - } break; - case IntrinsicOp::IOP_ReportHit: { - RewriteCallArg(CI, HLOperandIndex::kReportIntersectionAttributeOpIdx, - /*bIn*/ true, /*bOut*/ false); - } break; - case IntrinsicOp::IOP_CallShader: { - RewriteCallArg(CI, HLOperandIndex::kCallShaderPayloadOpIdx, - /*bIn*/ true, /*bOut*/ true); - } break; - case IntrinsicOp::MOP_DxHitObject_MakeMiss: { - if (OldVal == - CI->getArgOperand(HLOperandIndex::kHitObjectMakeMissRayDescOpIdx)) { + break; + case IntrinsicOp::MOP_DxHitObject_TraceRay: + if (OldVal == CI->getArgOperand( + HLOperandIndex::kHitObjectTraceRay_RayDescOpIdx)) { RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, /*loadElts*/ true); DeadInsts.push_back(CI); + return; } - } break; - case IntrinsicOp::MOP_TraceRayInline: { - if (OldVal == - CI->getArgOperand(HLOperandIndex::kTraceRayInlineRayDescOpIdx)) { + break; + case IntrinsicOp::MOP_DxHitObject_MakeMiss: + if (OldVal == CI->getArgOperand( + HLOperandIndex::kHitObjectMakeMiss_RayDescOpIdx)) { RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, /*loadElts*/ true); DeadInsts.push_back(CI); - break; + return; } - } - LLVM_FALLTHROUGH; - default: - // RayQuery this pointer replacement. - if (OldVal->getType()->isPointerTy() && - CI->getNumArgOperands() >= HLOperandIndex::kHandleOpIdx && - OldVal == CI->getArgOperand(HLOperandIndex::kHandleOpIdx) && - dxilutil::IsHLSLRayQueryType( - OldVal->getType()->getPointerElementType())) { - // For RayQuery methods, we want to replace the RayQuery this pointer - // with a load and use of the underlying handle value. - // This will allow elimination of RayQuery types earlier. + break; + case IntrinsicOp::MOP_TraceRayInline: + if (OldVal == + CI->getArgOperand(HLOperandIndex::kTraceRayInlineRayDescOpIdx)) { RewriteWithFlattenedHLIntrinsicCall(CI, OldVal, NewElts, /*loadElts*/ true); DeadInsts.push_back(CI); - break; + return; } - DXASSERT(0, "cannot flatten hlsl intrinsic."); + break; + default: + break; } + DXASSERT(0, "cannot flatten hlsl intrinsic."); } // TODO: check other high level dx operations if need to. } else { @@ -4390,6 +4491,9 @@ class SROA_Parameter_HLSL : public ModulePass { F->eraseFromParent(); } + // Expand flattened copy-in/copy-out for intrinsic UDT args: + copyIntrinsicAggArgs(*m_pHLModule); + // SROA globals and allocas. 
SROAGlobalAndAllocas(*m_pHLModule, m_HasDbgInfo); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 69ca2688c8..d8e8fa11bd 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4472,8 +4472,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); - unsigned MaxSafeDepDist = -1U; - if (Legal->getMaxSafeDepDistBytes() != -1U) + unsigned MaxSafeDepDist = std::numeric_limits::max(); + if (Legal->getMaxSafeDepDistBytes() != std::numeric_limits::max()) MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); @@ -4638,7 +4638,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; // We used the distance for the interleave count. - if (Legal->getMaxSafeDepDistBytes() != -1U) + if (Legal->getMaxSafeDepDistBytes() != std::numeric_limits::max()) return 1; // Do not interleave loops with a relatively small trip count. diff --git a/tools/clang/include/clang/AST/DeclCXX.h b/tools/clang/include/clang/AST/DeclCXX.h index 36e0f99c82..3b07576545 100644 --- a/tools/clang/include/clang/AST/DeclCXX.h +++ b/tools/clang/include/clang/AST/DeclCXX.h @@ -465,10 +465,6 @@ class CXXRecordDecl : public RecordDecl { /// \brief Whether we are currently parsing base specifiers. bool IsParsingBaseSpecifiers : 1; - /// \brief Whether this class contains at least one member or base - /// class containing an HLSL vector longer than 4 elements. - bool HasHLSLLongVector : 1; - /// \brief The number of base class specifiers in Bases. unsigned NumBases; @@ -1022,13 +1018,6 @@ class CXXRecordDecl : public RecordDecl { return data().NeedOverloadResolutionForDestructor; } - // HLSL Change add HLSL Long vector bit. - /// \brief Determine whether this class contains an HLSL long vector - /// of over 4 elements. - bool hasHLSLLongVector() { return data().HasHLSLLongVector; } - /// \brief Set that this class contains an HLSL long vector of over 4 elements - bool setHasHLSLLongVector() { return data().HasHLSLLongVector = true; } - /// \brief Determine whether this class describes a lambda function object. bool isLambda() const { // An update record can't turn a non-lambda into a lambda. diff --git a/tools/clang/include/clang/AST/Expr.h b/tools/clang/include/clang/AST/Expr.h index 26eff309f7..55fd184a79 100644 --- a/tools/clang/include/clang/AST/Expr.h +++ b/tools/clang/include/clang/AST/Expr.h @@ -4510,7 +4510,9 @@ class GenericSelectionExpr : public Expr { Expr *getControllingExpr() { return cast(SubExprs[CONTROLLING]); } /// Whether this generic selection is result-dependent. - bool isResultDependent() const { return ResultIndex == -1U; } + bool isResultDependent() const { + return ResultIndex == std::numeric_limits::max(); + } /// The zero-based index of the result expression's generic association in /// the generic selection's association list. Defined only if the diff --git a/tools/clang/include/clang/AST/HlslTypes.h b/tools/clang/include/clang/AST/HlslTypes.h index 3a02824b3a..43c1effdb8 100644 --- a/tools/clang/include/clang/AST/HlslTypes.h +++ b/tools/clang/include/clang/AST/HlslTypes.h @@ -6,9 +6,6 @@ // This file is distributed under the University of Illinois Open Source // // License. See LICENSE.TXT for details. 
// // // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. // -// All rights reserved. // -// // /// /// \file // /// \brief Defines the HLSL type system interface. // @@ -488,17 +485,21 @@ bool IsHLSLObjectWithImplicitMemberAccess(clang::QualType type); bool IsHLSLObjectWithImplicitROMemberAccess(clang::QualType type); bool IsHLSLRWNodeInputRecordType(clang::QualType type); bool IsHLSLRONodeInputRecordType(clang::QualType type); +bool IsHLSLDispatchNodeInputRecordType(clang::QualType type); +bool IsHLSLNodeRecordArrayType(clang::QualType type); bool IsHLSLNodeOutputType(clang::QualType type); +bool IsHLSLEmptyNodeRecordType(clang::QualType type); DXIL::NodeIOKind GetNodeIOType(clang::QualType type); bool IsHLSLStructuredBufferType(clang::QualType type); bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type); -bool IsHLSLNumericUserDefinedType(clang::QualType type); bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT); bool IsHLSLBuiltinRayAttributeStruct(clang::QualType QT); bool IsHLSLAggregateType(clang::QualType type); clang::QualType GetHLSLResourceResultType(clang::QualType type); +clang::QualType GetHLSLNodeIOResultType(clang::ASTContext &astContext, + clang::QualType type); unsigned GetHLSLResourceTemplateUInt(clang::QualType type); bool IsIncompleteHLSLResourceArrayType(clang::ASTContext &context, clang::QualType type); diff --git a/tools/clang/include/clang/AST/OperationKinds.h b/tools/clang/include/clang/AST/OperationKinds.h index 3909c8b5e8..d19082d699 100644 --- a/tools/clang/include/clang/AST/OperationKinds.h +++ b/tools/clang/include/clang/AST/OperationKinds.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// // // This file enumerates the different kinds of operations that can be diff --git a/tools/clang/include/clang/Basic/Attr.td b/tools/clang/include/clang/Basic/Attr.td index 2518423565..1797597d17 100644 --- a/tools/clang/include/clang/Basic/Attr.td +++ b/tools/clang/include/clang/Basic/Attr.td @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. 
-// //===----------------------------------------------------------------------===// class DocumentationCategory { @@ -1418,7 +1415,8 @@ def VKDecorateExt : InheritableAttr { def VKDecorateIdExt : InheritableAttr { let Spellings = [CXX11<"vk", "ext_decorate_id">]; - let Subjects = SubjectList<[Function, Var, ParmVar, TypedefName], ErrorDiag>; + let Subjects = + SubjectList<[Function, Var, ParmVar, Field, TypedefName], ErrorDiag>; let Args = [UnsignedArgument<"decorate">, VariadicExprArgument<"arguments">]; let LangOpts = [SPIRV]; let Documentation = [Undocumented]; @@ -1426,7 +1424,8 @@ def VKDecorateIdExt : InheritableAttr { def VKDecorateStringExt : InheritableAttr { let Spellings = [CXX11<"vk", "ext_decorate_string">]; - let Subjects = SubjectList<[Function, Var, ParmVar, TypedefName], ErrorDiag>; + let Subjects = + SubjectList<[Function, Var, ParmVar, Field, TypedefName], ErrorDiag>; let Args = [UnsignedArgument<"decorate">, VariadicStringArgument<"arguments">]; let LangOpts = [SPIRV]; let Documentation = [Undocumented]; diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td index 6254e5fc71..cbd9412566 100644 --- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -7558,8 +7555,6 @@ def err_hlsl_missing_type_specifier : Error< // Patterened after err_missing_typ "HLSL requires a type specifier for all declarations">; def err_hlsl_multiple_concrete_bases : Error< "multiple concrete base types specified">; -def err_hlsl_objectintemplateargument : Error< - "%0 is an object and cannot be used as a type parameter">; def err_hlsl_packoffset_requires_cbuffer : Error< "packoffset is only allowed in a constant buffer">; def warn_hlsl_packoffset_mix : Warning< @@ -7646,7 +7641,7 @@ def err_payload_requires_inout : Error< def err_attributes_requiers_in : Error< "intersection attributes parameter %0 must be 'in'">; def err_payload_attrs_must_be_udt : Error< - "%select{payload|attributes|callable}0 parameter %1 must be a user-defined type composed of only numeric types">; + "%select{payload|attributes|callable}0 %select{parameter %2|type}1 must be a user-defined type composed of only numeric types">; def err_shader_must_return_void : Error< "return type for '%0' shaders must be void">; def err_raytracing_entry_param_count : Error< @@ -7885,7 +7880,16 @@ def err_hlsl_unsupported_long_vector "cbuffers or tbuffers|user-defined struct parameter|" "entry function parameters|entry function return type|" "patch constant function parameters|patch constant function return type|" - "payload parameters}0 are not supported">; + "payload parameters|attributes}0 are not supported">; +// First %select options must match err_hlsl_unsupported_long_vector (same index used) +def err_hlsl_unsupported_object_context + : Error<"object %0 is not allowed in " + "%select{ConstantBuffers or TextureBuffers|" + "tessellation patches|geometry streams|node records|" + "cbuffers or tbuffers|user-defined struct parameter|" + "entry function parameters|entry function return type|" + "patch constant function 
parameters|patch constant function return type|" + "payload parameters|attributes|builtin template parameters|structured buffers|global variables|groupshared variables}1">; def err_hlsl_logical_binop_scalar : Error< "operands for short-circuiting logical binary operator must be scalar, for non-scalar types use '%select{and|or}0'">; def err_hlsl_ternary_scalar : Error< @@ -7970,8 +7974,6 @@ def err_hlsl_too_many_node_inputs : Error< "Node shader '%0' may not have more than one input record">; def err_hlsl_node_record_type : Error< "%0 is not valid as a node record type - struct/class required">; -def err_hlsl_node_record_object : Error< - "object %0 may not appear in a node record">; def err_hlsl_array_disallowed : Error< "%select{entry parameter|declaration}1 of type %0 may not be an array">; def err_hlsl_inputpatch_size: Error< @@ -8013,6 +8015,43 @@ def err_hlsl_reorder_unsupported_stage : Error< "dx::MaybeReorderThread is unavailable in shader stage '%0' (requires 'raygeneration')">; def err_hlsl_hitobject_unsupported_stage : Error< "dx::HitObject is unavailable in shader stage '%0' (requires 'raygeneration', 'closesthit' or 'miss')">; + +// Linear Algebra Operations +def err_hlsl_linalg_isunsigned_incorrect_for_given_type : Error< + "%0 must be %select{false|true}1 for vector of " + "%select{floating point|signed integer|unsigned integer}2 type">; +def err_hlsl_linalg_interpretation_value_incorrect : Error< + "%0 is an invalid %select{memory|register}1 interpretation value">; +def err_hlsl_linalg_matrix_layout_is_not_transposable : Error< + "RowMajor and ColumnMajor matrices are not transposable">; +def err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero : Error< + "for optimal matrix layout, matrix stride must be 0">; +def err_hlsl_linalg_matrix_dim_must_be_greater_than_zero: Error< + "matrix dimension must be greater than 0">; +def err_hlsl_linalg_matrix_layout_invalid : Error< + "matrix layout %0 is not valid, must be in the range [%1, %2]">; + +def err_hlsl_linalg_mul_muladd_output_vector_size_not_equal_to_matrix_M : Error< + "output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation">; +def err_hlsl_linalg_mul_muladd_unpacked_input_vector_size_not_equal_to_matrix_K : Error< + "unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation">; +def err_hlsl_linalg_mul_muladd_packed_input_vector_size_incorrect : Error< + "packed input vector length must be the smallest number that can hold matrix dim K values of the " + "packed(smaller) type in linalg mul/muladd operations">; +def err_hlsl_linalg_mul_muladd_isUnsigned_for_packed_input_must_be_true : Error< + "IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations">; +def err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint : Error< + "packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations">; +def err_hlsl_linalg_mul_muladd_invalid_dim: Error< + "matrix dimension %select{M|K when using unpacked input vectors|K " + "when using packed input vectors}0 must be less than %1, in a linalg " + "Mul/MulAdd operation">; + +def err_hlsl_linalg_outer_prod_acc_vector_type_mismatch : Error< + "input vectors of outerproductaccumulate must have the same element type">; +def err_hlsl_linalg_outer_prod_acc_matrix_layout_must_be_outer_prod_acc_optimal : Error< + "matrix layout for outerproductaccumulate must be %0">; + // HLSL Change Ends // SPIRV Change Starts @@ -8021,6 +8060,8 @@ def 
err_hlsl_vk_pointer_cast_alignment: Error< "Vulkan buffer pointer cannot be cast to greater alignment">; def err_hlsl_vk_static_pointer_cast_type: Error< "vk::static_pointer_cast() content type must be base class of argument's content type">; +def warn_spirv_node_shaders_experimental : Warning< + "SPIR-V implementation of node shaders is experimental and subject to change">; // SPIRV Change Ends let CategoryName = "OpenMP Issue" in { diff --git a/tools/clang/include/clang/SPIRV/AstTypeProbe.h b/tools/clang/include/clang/SPIRV/AstTypeProbe.h index 6302d43a88..9abea972c6 100644 --- a/tools/clang/include/clang/SPIRV/AstTypeProbe.h +++ b/tools/clang/include/clang/SPIRV/AstTypeProbe.h @@ -337,6 +337,10 @@ bool isOrContainsNonFpColMajorMatrix(const ASTContext &, const SpirvCodeGenOptions &, QualType type, const Decl *decl); +/// brief Returns true if the type is a boolean type or an aggragate type that +/// contains a boolean type. +bool isOrContainsBoolType(QualType type); + /// \brief Returns true if the given type is `vk::ext_result_id`. bool isExtResultIdType(QualType type); diff --git a/tools/clang/include/clang/SPIRV/FeatureManager.h b/tools/clang/include/clang/SPIRV/FeatureManager.h index 3c1871df37..94dc5bf1ab 100644 --- a/tools/clang/include/clang/SPIRV/FeatureManager.h +++ b/tools/clang/include/clang/SPIRV/FeatureManager.h @@ -57,6 +57,7 @@ enum class Extension { KHR_ray_query, EXT_shader_image_int64, KHR_physical_storage_buffer, + AMD_shader_enqueue, KHR_vulkan_memory_model, NV_compute_shader_derivatives, KHR_compute_shader_derivatives, diff --git a/tools/clang/include/clang/SPIRV/SpirvBuilder.h b/tools/clang/include/clang/SPIRV/SpirvBuilder.h index 5e03d1ef96..465f7313f1 100644 --- a/tools/clang/include/clang/SPIRV/SpirvBuilder.h +++ b/tools/clang/include/clang/SPIRV/SpirvBuilder.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_SPIRVBUILDER_H #define LLVM_CLANG_SPIRV_SPIRVBUILDER_H @@ -437,6 +434,25 @@ class SpirvBuilder { QualType resultType, NonSemanticDebugPrintfInstructions instId, llvm::ArrayRef operands, SourceLocation); + SpirvInstruction *createIsNodePayloadValid(SpirvInstruction *payloadArray, + SpirvInstruction *nodeIndex, + SourceLocation); + + SpirvInstruction *createNodePayloadArrayLength(SpirvInstruction *payloadArray, + SourceLocation); + + SpirvInstruction *createAllocateNodePayloads(QualType resultType, + spv::Scope allocationScope, + SpirvInstruction *shaderIndex, + SpirvInstruction *recordCount, + SourceLocation); + + void createEnqueueOutputNodePayloads(SpirvInstruction *payload, + SourceLocation); + + SpirvInstruction *createFinishWritingNodePayload(SpirvInstruction *payload, + SourceLocation); + /// \brief Creates an OpMemoryBarrier or OpControlBarrier instruction with the /// given flags. If execution scope (exec) is provided, an OpControlBarrier /// is created; otherwise an OpMemoryBarrier is created. @@ -615,8 +631,15 @@ class SpirvBuilder { inline SpirvInstruction *addExecutionMode(SpirvFunction *entryPoint, spv::ExecutionMode em, llvm::ArrayRef params, - SourceLocation, - bool useIdParams = false); + SourceLocation); + + /// \brief Adds an execution mode to the module under construction if it does + /// not already exist. 
Return the newly added instruction or the existing + /// instruction, if one already exists. + inline SpirvInstruction * + addExecutionModeId(SpirvFunction *entryPoint, spv::ExecutionMode em, + llvm::ArrayRef params, + SourceLocation loc); /// \brief Adds an OpModuleProcessed instruction to the module under /// construction. @@ -759,6 +782,7 @@ class SpirvBuilder { llvm::ArrayRef constituents, bool specConst = false); SpirvConstant *getConstantNull(QualType); + SpirvConstant *getConstantString(llvm::StringRef str, bool specConst = false); SpirvUndef *getUndef(QualType); SpirvString *createString(llvm::StringRef str); @@ -963,17 +987,44 @@ SpirvBuilder::setDebugSource(uint32_t major, uint32_t minor, SpirvInstruction * SpirvBuilder::addExecutionMode(SpirvFunction *entryPoint, spv::ExecutionMode em, llvm::ArrayRef params, - SourceLocation loc, bool useIdParams) { + SourceLocation loc) { SpirvExecutionMode *mode = nullptr; - SpirvExecutionMode *existingInstruction = + SpirvExecutionModeBase *existingInstruction = mod->findExecutionMode(entryPoint, em); if (!existingInstruction) { - mode = new (context) - SpirvExecutionMode(loc, entryPoint, em, params, useIdParams); + mode = new (context) SpirvExecutionMode(loc, entryPoint, em, params); + mod->addExecutionMode(mode); + } else { + // No execution mode can be used with both OpExecutionMode and + // OpExecutionModeId. If this assert is triggered, then either this + // `addExecutionModeId` should have been called with `em` or the existing + // instruction is wrong. + assert(existingInstruction->getKind() == + SpirvInstruction::IK_ExecutionMode); + mode = cast(existingInstruction); + } + + return mode; +} + +SpirvInstruction *SpirvBuilder::addExecutionModeId( + SpirvFunction *entryPoint, spv::ExecutionMode em, + llvm::ArrayRef params, SourceLocation loc) { + SpirvExecutionModeId *mode = nullptr; + SpirvExecutionModeBase *existingInstruction = + mod->findExecutionMode(entryPoint, em); + if (!existingInstruction) { + mode = new (context) SpirvExecutionModeId(loc, entryPoint, em, params); mod->addExecutionMode(mode); } else { - mode = existingInstruction; + // No execution mode can be used with both OpExecutionMode and + // OpExecutionModeId. If this assert is triggered, then either this + // `addExecutionMode` should have been called with `em` or the existing + // instruction is wrong. + assert(existingInstruction->getKind() == + SpirvInstruction::IK_ExecutionModeId); + mode = cast(existingInstruction); } return mode; diff --git a/tools/clang/include/clang/SPIRV/SpirvContext.h b/tools/clang/include/clang/SPIRV/SpirvContext.h index c18c139642..8e0458e731 100644 --- a/tools/clang/include/clang/SPIRV/SpirvContext.h +++ b/tools/clang/include/clang/SPIRV/SpirvContext.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_SPIRVCONTEXT_H #define LLVM_CLANG_SPIRV_SPIRVCONTEXT_H @@ -101,6 +98,21 @@ struct RuntimeArrayTypeMapInfo { } }; +// Provides DenseMapInfo for NodePayloadArrayType so we can create a DenseSet of +// node payload array types. 
+struct NodePayloadArrayTypeMapInfo { + static inline NodePayloadArrayType *getEmptyKey() { return nullptr; } + static inline NodePayloadArrayType *getTombstoneKey() { return nullptr; } + static unsigned getHashValue(const NodePayloadArrayType *Val) { + return llvm::hash_combine(Val->getElementType(), Val->getNodeDecl()); + } + static bool isEqual(const NodePayloadArrayType *LHS, + const NodePayloadArrayType *RHS) { + // Either both are null, or both should have the same underlying type. + return (LHS == RHS) || (LHS && RHS && *LHS == *RHS); + } +}; + // Provides DenseMapInfo for ImageType so we can create a DenseSet of // image types. struct ImageTypeMapInfo { @@ -273,6 +285,9 @@ class SpirvContext { const RuntimeArrayType * getRuntimeArrayType(const SpirvType *elemType, llvm::Optional arrayStride); + const NodePayloadArrayType * + getNodePayloadArrayType(const SpirvType *elemType, + const ParmVarDecl *nodeDecl); const StructType *getStructType( llvm::ArrayRef fields, llvm::StringRef name, @@ -349,6 +364,7 @@ class SpirvContext { bool isDS() const { return curShaderModelKind == ShaderModelKind::Domain; } bool isCS() const { return curShaderModelKind == ShaderModelKind::Compute; } bool isLib() const { return curShaderModelKind == ShaderModelKind::Library; } + bool isNode() const { return curShaderModelKind == ShaderModelKind::Node; } bool isRay() const { return curShaderModelKind >= ShaderModelKind::RayGeneration && curShaderModelKind <= ShaderModelKind::Callable; @@ -440,6 +456,31 @@ class SpirvContext { instructionsWithLoweredType.end(); } + void registerDispatchGridIndex(const RecordDecl *decl, unsigned index) { + auto iter = dispatchGridIndices.find(decl); + if (iter == dispatchGridIndices.end()) { + dispatchGridIndices[decl] = index; + } + } + + llvm::Optional getDispatchGridIndex(const RecordDecl *decl) { + auto iter = dispatchGridIndices.find(decl); + if (iter != dispatchGridIndices.end()) { + return iter->second; + } + return llvm::None; + } + + void registerNodeDeclPayloadType(const NodePayloadArrayType *type, + const ParmVarDecl *decl) { + nodeDecls[decl] = type; + } + + const NodePayloadArrayType *getNodeDeclPayloadType(const ParmVarDecl *decl) { + auto iter = nodeDecls.find(decl); + return iter == nodeDecls.end() ? nullptr : iter->second; + } + private: /// \brief The allocator used to create SPIR-V entity objects. /// @@ -484,6 +525,8 @@ class SpirvContext { llvm::DenseSet arrayTypes; llvm::DenseSet runtimeArrayTypes; + llvm::DenseSet + nodePayloadArrayTypes; llvm::SmallVector structTypes; llvm::SmallVector hybridStructTypes; llvm::DenseMap pointerTypes; @@ -510,6 +553,9 @@ class SpirvContext { llvm::StringMap debugInfo; SpirvDebugInstruction *currentLexicalScope; + // Mapping from graphics node input record types to member decoration maps. + llvm::MapVector dispatchGridIndices; + // Mapping from SPIR-V type to debug type instruction. // The purpose is not to generate several DebugType* instructions for the same // type if the type is used for several variables. @@ -541,6 +587,10 @@ class SpirvContext { // Set of instructions that already have lowered SPIR-V types. llvm::DenseSet instructionsWithLoweredType; + + // Mapping from shader entry function parameter declaration to node payload + // array type. 
+ llvm::MapVector nodeDecls; }; } // end namespace spirv diff --git a/tools/clang/include/clang/SPIRV/SpirvInstruction.h b/tools/clang/include/clang/SPIRV/SpirvInstruction.h index f49a295610..52f4128a6c 100644 --- a/tools/clang/include/clang/SPIRV/SpirvInstruction.h +++ b/tools/clang/include/clang/SPIRV/SpirvInstruction.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_SPIRVINSTRUCTION_H #define LLVM_CLANG_SPIRV_SPIRVINSTRUCTION_H @@ -57,6 +54,7 @@ class SpirvInstruction { IK_MemoryModel, // OpMemoryModel IK_EntryPoint, // OpEntryPoint IK_ExecutionMode, // OpExecutionMode + IK_ExecutionModeId, // OpExecutionModeId IK_String, // OpString (debug) IK_Source, // OpSource (debug) IK_ModuleProcessed, // OpModuleProcessed (debug) @@ -69,6 +67,7 @@ class SpirvInstruction { IK_ConstantInteger, IK_ConstantFloat, IK_ConstantComposite, + IK_ConstantString, IK_ConstantNull, // Pointer <-> uint conversions. @@ -167,6 +166,13 @@ class SpirvInstruction { IK_DebugTypeMember, IK_DebugTypeTemplate, IK_DebugTypeTemplateParameter, + + // For workgraph instructions + IK_IsNodePayloadValid, + IK_NodePayloadArrayLength, + IK_AllocateNodePayloads, + IK_EnqueueNodePayloads, + IK_FinishWritingNodePayload, }; // All instruction classes should include a releaseMemory method. @@ -404,12 +410,34 @@ class SpirvEntryPoint : public SpirvInstruction { llvm::SmallVector interfaceVec; }; +class SpirvExecutionModeBase : public SpirvInstruction { +public: + SpirvExecutionModeBase(Kind kind, spv::Op opcode, SourceLocation loc, + SpirvFunction *entryPointFunction, + spv::ExecutionMode executionMode) + : SpirvInstruction(kind, opcode, QualType(), loc), + entryPoint(entryPointFunction), execMode(executionMode) {} + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionModeBase) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { return false; } + + bool invokeVisitor(Visitor *v) override; + + SpirvFunction *getEntryPoint() const { return entryPoint; } + spv::ExecutionMode getExecutionMode() const { return execMode; } + +private: + SpirvFunction *entryPoint; + spv::ExecutionMode execMode; +}; + /// \brief OpExecutionMode and OpExecutionModeId instructions -class SpirvExecutionMode : public SpirvInstruction { +class SpirvExecutionMode : public SpirvExecutionModeBase { public: SpirvExecutionMode(SourceLocation loc, SpirvFunction *entryPointFunction, - spv::ExecutionMode, llvm::ArrayRef params, - bool usesIdParams); + spv::ExecutionMode, llvm::ArrayRef params); DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionMode) @@ -430,6 +458,28 @@ class SpirvExecutionMode : public SpirvInstruction { llvm::SmallVector params; }; +/// \brief OpExecutionModeId +class SpirvExecutionModeId : public SpirvExecutionModeBase { +public: + SpirvExecutionModeId(SourceLocation loc, SpirvFunction *entryPointFunction, + spv::ExecutionMode em, + llvm::ArrayRef params); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvExecutionModeId) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_ExecutionModeId; + } + + bool invokeVisitor(Visitor *v) override; + + llvm::ArrayRef getParams() const { return params; } + +private: + llvm::SmallVector params; +}; + /// \brief OpString instruction class SpirvString : public 
SpirvInstruction { public: @@ -1018,6 +1068,119 @@ class SpirvBarrier : public SpirvInstruction { llvm::Optional executionScope; }; +/// \brief OpIsNodePayloadValidAMDX instruction +class SpirvIsNodePayloadValid : public SpirvInstruction { +public: + SpirvIsNodePayloadValid(QualType resultType, SourceLocation loc, + SpirvInstruction *payloadArray, + SpirvInstruction *nodeIndex); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvIsNodePayloadValid) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_IsNodePayloadValid; + } + + bool invokeVisitor(Visitor *v) override; + + SpirvInstruction *getPayloadArray() { return payloadArray; } + SpirvInstruction *getNodeIndex() { return nodeIndex; } + +private: + SpirvInstruction *payloadArray; + SpirvInstruction *nodeIndex; +}; + +/// \brief OpNodePayloadArrayLengthAMDX instruction +class SpirvNodePayloadArrayLength : public SpirvInstruction { +public: + SpirvNodePayloadArrayLength(QualType resultType, SourceLocation loc, + SpirvInstruction *payloadArray); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvNodePayloadArrayLength) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_NodePayloadArrayLength; + } + + bool invokeVisitor(Visitor *v) override; + + SpirvInstruction *getPayloadArray() { return payloadArray; } + +private: + SpirvInstruction *payloadArray; +}; + +/// \brief OpAllocateNodePayloadsAMDX instruction +class SpirvAllocateNodePayloads : public SpirvInstruction { +public: + SpirvAllocateNodePayloads(QualType resultType, SourceLocation loc, + spv::Scope allocationScope, + SpirvInstruction *shaderIndex, + SpirvInstruction *recordCount); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvAllocateNodePayloads) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_AllocateNodePayloads; + } + + bool invokeVisitor(Visitor *v) override; + + spv::Scope getAllocationScope() { return allocationScope; } + SpirvInstruction *getShaderIndex() { return shaderIndex; } + SpirvInstruction *getRecordCount() { return recordCount; } + +private: + spv::Scope allocationScope; + SpirvInstruction *shaderIndex; + SpirvInstruction *recordCount; +}; + +/// \brief OpReleaseOutputNodePayloadAMDX instruction +class SpirvEnqueueNodePayloads : public SpirvInstruction { +public: + SpirvEnqueueNodePayloads(SourceLocation loc, SpirvInstruction *payload); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvEnqueueNodePayloads) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_EnqueueNodePayloads; + } + + bool invokeVisitor(Visitor *v) override; + + SpirvInstruction *getPayload() { return payload; } + +private: + SpirvInstruction *payload; +}; + +/// \brief OpFinishWritingNodePayloadAMDX instruction +class SpirvFinishWritingNodePayload : public SpirvInstruction { +public: + SpirvFinishWritingNodePayload(QualType resultType, SourceLocation loc, + SpirvInstruction *payload); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvFinishWritingNodePayload) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_FinishWritingNodePayload; + } + + bool invokeVisitor(Visitor *v) override; + + SpirvInstruction *getPayload() { return payload; } + +private: + SpirvInstruction *payload; +}; + /// \brief Represents SPIR-V binary operation instructions. 
/// /// This class includes: @@ -1314,6 +1477,27 @@ class SpirvConstantNull : public SpirvConstant { bool operator==(const SpirvConstantNull &that) const; }; +class SpirvConstantString : public SpirvConstant { +public: + SpirvConstantString(llvm::StringRef stringLiteral, bool isSpecConst = false); + + DEFINE_RELEASE_MEMORY_FOR_CLASS(SpirvConstantString) + + // For LLVM-style RTTI + static bool classof(const SpirvInstruction *inst) { + return inst->getKind() == IK_ConstantString; + } + + bool invokeVisitor(Visitor *v) override; + + bool operator==(const SpirvConstantString &that) const; + + llvm::StringRef getString() const { return str; } + +private: + std::string str; +}; + class SpirvConvertPtrToU : public SpirvInstruction { public: SpirvConvertPtrToU(SpirvInstruction *ptr, QualType type, diff --git a/tools/clang/include/clang/SPIRV/SpirvModule.h b/tools/clang/include/clang/SPIRV/SpirvModule.h index 298c06d65e..9ab0c296b8 100644 --- a/tools/clang/include/clang/SPIRV/SpirvModule.h +++ b/tools/clang/include/clang/SPIRV/SpirvModule.h @@ -119,11 +119,11 @@ class SpirvModule { // Returns an existing execution mode instruction that is the same as em if it // exists. Return nullptr otherwise. - SpirvExecutionMode *findExecutionMode(SpirvFunction *entryPoint, - spv::ExecutionMode em); + SpirvExecutionModeBase *findExecutionMode(SpirvFunction *entryPoint, + spv::ExecutionMode em); // Adds an execution mode to the module. - void addExecutionMode(SpirvExecutionMode *); + void addExecutionMode(SpirvExecutionModeBase *em); // Adds an extension to the module. Returns true if the extension was added. // Returns false otherwise (e.g. if the extension already existed). @@ -194,7 +194,7 @@ class SpirvModule { llvm::SmallVector extInstSets; SpirvMemoryModel *memoryModel; llvm::SmallVector entryPoints; - llvm::SmallVector executionModes; + llvm::SmallVector executionModes; llvm::SmallVector constStrings; std::vector sources; std::vector moduleProcesses; diff --git a/tools/clang/include/clang/SPIRV/SpirvType.h b/tools/clang/include/clang/SPIRV/SpirvType.h index 00a00ef238..7966e3e0de 100644 --- a/tools/clang/include/clang/SPIRV/SpirvType.h +++ b/tools/clang/include/clang/SPIRV/SpirvType.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_SPIRVTYPE_H #define LLVM_CLANG_SPIRV_SPIRVTYPE_H @@ -54,6 +51,7 @@ class SpirvType { TK_SampledImage, TK_Array, TK_RuntimeArray, + TK_NodePayloadArrayAMD, TK_Struct, TK_Pointer, TK_ForwardPointer, @@ -294,6 +292,26 @@ class RuntimeArrayType : public SpirvType { llvm::Optional stride; }; +class NodePayloadArrayType : public SpirvType { +public: + NodePayloadArrayType(const SpirvType *elemType, const ParmVarDecl *decl) + : SpirvType(TK_NodePayloadArrayAMD), elementType(elemType), + nodeDecl(decl) {} + + static bool classof(const SpirvType *t) { + return t->getKind() == TK_NodePayloadArrayAMD; + } + + bool operator==(const NodePayloadArrayType &that) const; + + const SpirvType *getElementType() const { return elementType; } + const ParmVarDecl *getNodeDecl() const { return nodeDecl; } + +private: + const SpirvType *elementType; + const ParmVarDecl *nodeDecl; +}; + // The StructType is the lowered type that best represents what a structure type // is in SPIR-V. 
Contains all necessary information for properly emitting a // SPIR-V structure type. @@ -630,6 +648,8 @@ bool SpirvType::isOrContainsType(const SpirvType *type) { return isOrContainsType(pointerType->getPointeeType()); if (const auto *raType = dyn_cast(type)) return isOrContainsType(raType->getElementType()); + if (const auto *npaType = dyn_cast(type)) + return isOrContainsType(npaType->getElementType()); if (const auto *imgType = dyn_cast(type)) return isOrContainsType(imgType->getSampledType()); if (const auto *sampledImageType = dyn_cast(type)) diff --git a/tools/clang/include/clang/SPIRV/SpirvVisitor.h b/tools/clang/include/clang/SPIRV/SpirvVisitor.h index 93682518a1..a6de26c807 100644 --- a/tools/clang/include/clang/SPIRV/SpirvVisitor.h +++ b/tools/clang/include/clang/SPIRV/SpirvVisitor.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_SPIRVVISITOR_H #define LLVM_CLANG_SPIRV_SPIRVVISITOR_H @@ -64,7 +61,7 @@ class Visitor { DEFINE_VISIT_METHOD(SpirvExtInstImport) DEFINE_VISIT_METHOD(SpirvMemoryModel) DEFINE_VISIT_METHOD(SpirvEntryPoint) - DEFINE_VISIT_METHOD(SpirvExecutionMode) + DEFINE_VISIT_METHOD(SpirvExecutionModeBase) DEFINE_VISIT_METHOD(SpirvString) DEFINE_VISIT_METHOD(SpirvSource) DEFINE_VISIT_METHOD(SpirvModuleProcessed) @@ -85,6 +82,11 @@ class Visitor { DEFINE_VISIT_METHOD(SpirvAccessChain) DEFINE_VISIT_METHOD(SpirvAtomic) DEFINE_VISIT_METHOD(SpirvBarrier) + DEFINE_VISIT_METHOD(SpirvIsNodePayloadValid) + DEFINE_VISIT_METHOD(SpirvNodePayloadArrayLength) + DEFINE_VISIT_METHOD(SpirvAllocateNodePayloads) + DEFINE_VISIT_METHOD(SpirvEnqueueNodePayloads) + DEFINE_VISIT_METHOD(SpirvFinishWritingNodePayload) DEFINE_VISIT_METHOD(SpirvBinaryOp) DEFINE_VISIT_METHOD(SpirvBitFieldExtract) DEFINE_VISIT_METHOD(SpirvBitFieldInsert) @@ -92,6 +94,7 @@ class Visitor { DEFINE_VISIT_METHOD(SpirvConstantInteger) DEFINE_VISIT_METHOD(SpirvConstantFloat) DEFINE_VISIT_METHOD(SpirvConstantComposite) + DEFINE_VISIT_METHOD(SpirvConstantString) DEFINE_VISIT_METHOD(SpirvConstantNull) DEFINE_VISIT_METHOD(SpirvConvertPtrToU) DEFINE_VISIT_METHOD(SpirvConvertUToPtr) diff --git a/tools/clang/include/clang/Sema/ExternalSemaSource.h b/tools/clang/include/clang/Sema/ExternalSemaSource.h index 91578e2440..b10d649cc6 100644 --- a/tools/clang/include/clang/Sema/ExternalSemaSource.h +++ b/tools/clang/include/clang/Sema/ExternalSemaSource.h @@ -211,10 +211,9 @@ class ExternalSemaSource : public ExternalASTSource { // add call candidates to the given expression. It returns 'true' // if standard overload search should be suppressed; false otherwise. virtual bool AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE, - ArrayRef Args, - OverloadCandidateSet &CandidateSet, - bool PartialOverloading) - { + ArrayRef Args, + OverloadCandidateSet &CandidateSet, + Scope *S, bool PartialOverloading) { return false; } diff --git a/tools/clang/include/clang/Sema/Overload.h b/tools/clang/include/clang/Sema/Overload.h index 89de4ce984..473af49cab 100644 --- a/tools/clang/include/clang/Sema/Overload.h +++ b/tools/clang/include/clang/Sema/Overload.h @@ -57,7 +57,7 @@ namespace clang { /// convert an argument to a parameter's type. 
The enumerator values /// match with Table 9 of (C++ 13.3.3.1.1) and are listed such that /// better conversion kinds have smaller values. - enum ImplicitConversionKind { + enum ImplicitConversionKind : unsigned int { ICK_Identity = 0, ///< Identity conversion (no conversion) ICK_Lvalue_To_Rvalue, ///< Lvalue-to-rvalue conversion (C++ 4.1) ICK_Array_To_Pointer, ///< Array-to-pointer conversion (C++ 4.2) @@ -79,27 +79,28 @@ namespace clang { ICK_Vector_Conversion, ///< Vector conversions ICK_Vector_Splat, ///< A vector splat from an arithmetic type ICK_Complex_Real, ///< Complex-real conversions (C99 6.3.1.7) - ICK_Block_Pointer_Conversion, ///< Block Pointer conversions + ICK_Block_Pointer_Conversion, ///< Block Pointer conversions ICK_TransparentUnionConversion, ///< Transparent Union Conversions - ICK_Writeback_Conversion, ///< Objective-C ARC writeback conversion + ICK_Writeback_Conversion, ///< Objective-C ARC writeback conversion ICK_Zero_Event_Conversion, ///< Zero constant to event (OpenCL1.2 6.12.10) // HLSL Change Starts - // The following conversion types also imply a potential followup + // The following conversion types also imply a potential followup // ComponentConversion. // List is roughly ordered to preserve the property: // "better conversion kinds have smaller values" - // Unfortunately, this property isn't really possible to preserve due + // Unfortunately, this property isn't really possible to preserve due // to potential additional component conversion. ICK_HLSLVector_Scalar, ///< HLSLVector/Matrix to scalar ICK_HLSLVector_Conversion, ///< HLSLVector/Matrix conversion - ICK_Flat_Conversion, ///< Flat assignment conversion for HLSL (inline conversion, straddled) + ICK_Flat_Conversion, ///< Flat assignment conversion for HLSL (inline + ///< conversion, straddled) ICK_HLSLVector_Splat, ///< HLSLVector/Matrix splat ICK_HLSLVector_Truncation, ///< HLSLVector/Matrix truncation ICK_HLSL_Derived_To_Base, ///< HLSL Derived-to-base // HLSL Change Ends - ICK_Num_Conversion_Kinds ///< The number of conversion kinds + ICK_Num_Conversion_Kinds ///< The number of conversion kinds }; /// ImplicitConversionRank - The rank of an implicit conversion diff --git a/tools/clang/include/clang/Sema/Sema.h b/tools/clang/include/clang/Sema/Sema.h index 755c7e0755..6eb0aba801 100644 --- a/tools/clang/include/clang/Sema/Sema.h +++ b/tools/clang/include/clang/Sema/Sema.h @@ -2495,9 +2495,14 @@ class Sema { DeclAccessPair FoundDecl, FunctionDecl *Fn); + // HLSL Change Begin + void CollectNamespaceContexts(Scope *, + SmallVectorImpl &); + // HLSL Change End void AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE, ArrayRef Args, OverloadCandidateSet &CandidateSet, + Scope *S, // HLSL Change bool PartialOverloading = false); // An enum used to represent the different possible results of building a @@ -3806,8 +3811,7 @@ class Sema { void DiagnoseHLSLDeclAttr(const Decl *D, const Attr *A); void DiagnoseCoherenceMismatch(const Expr *SrcExpr, QualType TargetType, SourceLocation Loc); - void CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, - const FunctionProtoType *Proto); + void CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall); void DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM, hlsl::DXIL::ShaderKind EntrySK, hlsl::DXIL::NodeLaunchType NodeLaunchTy, @@ -8826,8 +8830,6 @@ class Sema { bool AllowOnePastEnd=true, bool IndexNegated=false); // HLSL Change Starts - checking array subscript access to vector or matrix member void CheckHLSLArrayAccess(const 
Expr *expr); - bool CheckHLSLIntrinsicCall(FunctionDecl *FDecl, CallExpr *TheCall); - bool CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall); // HLSL Change ends void CheckArrayAccess(const Expr *E); // Used to grab the relevant information from a FormatAttr and a diff --git a/tools/clang/include/clang/Sema/SemaHLSL.h b/tools/clang/include/clang/Sema/SemaHLSL.h index 59d99ab4c5..80ce8ddd7d 100644 --- a/tools/clang/include/clang/Sema/SemaHLSL.h +++ b/tools/clang/include/clang/Sema/SemaHLSL.h @@ -59,6 +59,38 @@ bool DiagnoseNodeStructArgument(clang::Sema *self, clang::QualType ArgTy, bool &Empty, const clang::FieldDecl *FD = nullptr); +// Keep this in sync with err_hlsl_unsupported_object in DiagnosticSemaKinds.td +enum class TypeDiagContext { + // Indices that the type context is valid and no diagnostics should be emitted + // for this type category. + Valid = -1, + // Supported indices for both `err_hlsl_unsupported_object_context` and + // `err_hlsl_unsupported_long_vector` + ConstantBuffersOrTextureBuffers = 0, + TessellationPatches = 1, + GeometryStreams = 2, + NodeRecords = 3, + CBuffersOrTBuffers = 4, + UserDefinedStructParameter = 5, + EntryFunctionParameters = 6, + EntryFunctionReturnType = 7, + PatchConstantFunctionParameters = 8, + PatchConstantFunctionReturnType = 9, + PayloadParameters = 10, + Attributes = 11, + TypeParameter = 12, + LongVecDiagMaxSelectIndex = TypeParameter, + // Below only supported for `err_hlsl_diag_unsupported_object_context` + StructuredBuffers = 13, + GlobalVariables = 14, + GroupShared = 15, + DiagMaxSelectIndex = 15, +}; +bool DiagnoseTypeElements(clang::Sema &S, clang::SourceLocation Loc, + clang::QualType Ty, TypeDiagContext ObjDiagContext, + TypeDiagContext LongVecDiagContext, + const clang::FieldDecl *FD = nullptr); + void DiagnoseControlFlowConditionForHLSL(clang::Sema *self, clang::Expr *condExpr, llvm::StringRef StmtName); diff --git a/tools/clang/lib/AST/ASTContextHLSL.cpp b/tools/clang/lib/AST/ASTContextHLSL.cpp index 0a688c03fa..913b28ced8 100644 --- a/tools/clang/lib/AST/ASTContextHLSL.cpp +++ b/tools/clang/lib/AST/ASTContextHLSL.cpp @@ -6,9 +6,6 @@ // This file is distributed under the University of Illinois Open Source // // License. See LICENSE.TXT for details. // // // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. // -// All rights reserved. // -// // // This file implements the ASTContext interface for HLSL. 
// // // /////////////////////////////////////////////////////////////////////////////// diff --git a/tools/clang/lib/AST/DeclCXX.cpp b/tools/clang/lib/AST/DeclCXX.cpp index baed44667f..8023a0a588 100644 --- a/tools/clang/lib/AST/DeclCXX.cpp +++ b/tools/clang/lib/AST/DeclCXX.cpp @@ -72,8 +72,8 @@ CXXRecordDecl::DefinitionData::DefinitionData(CXXRecordDecl *D) ImplicitCopyAssignmentHasConstParam(true), HasDeclaredCopyConstructorWithConstParam(false), HasDeclaredCopyAssignmentWithConstParam(false), IsLambda(false), - IsParsingBaseSpecifiers(false), HasHLSLLongVector(false), NumBases(0), - NumVBases(0), Bases(), VBases(), Definition(D), FirstFriend() {} + IsParsingBaseSpecifiers(false), NumBases(0), NumVBases(0), Bases(), + VBases(), Definition(D), FirstFriend() {} // HLSL Change End - Add HasLongVector and clang-format CXXBaseSpecifier *CXXRecordDecl::DefinitionData::getBasesSlowCase() const { @@ -203,11 +203,6 @@ CXXRecordDecl::setBases(CXXBaseSpecifier const * const *Bases, if (!BaseClassDecl->isStandardLayout()) data().IsStandardLayout = false; - // HLSL Change Begin - Propagate presence of long vector to child classes. - if (BaseClassDecl->hasHLSLLongVector()) - data().HasHLSLLongVector = true; - // HLSL Change End - // Record if this base is the first non-literal field or base. if (!hasNonLiteralTypeFieldsOrBases() && !BaseType->isLiteralType(C)) data().HasNonLiteralTypeFieldsOrBases = true; @@ -389,11 +384,6 @@ void CXXRecordDecl::addedClassSubobject(CXXRecordDecl *Subobj) { data().NeedOverloadResolutionForMoveConstructor = true; data().NeedOverloadResolutionForDestructor = true; } - - // HLSL Change Begin - Propagate presence of long vector to child classes. - if (Subobj->hasHLSLLongVector()) - data().HasHLSLLongVector = true; - // HLSL Change End } /// Callback function for CXXRecordDecl::forallBases that acknowledges diff --git a/tools/clang/lib/AST/Expr.cpp b/tools/clang/lib/AST/Expr.cpp index c6dc21217e..2d039a7e98 100644 --- a/tools/clang/lib/AST/Expr.cpp +++ b/tools/clang/lib/AST/Expr.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// // // This file implements the Expr class and subclasses. 
@@ -3886,25 +3883,21 @@ GenericSelectionExpr::GenericSelectionExpr(const ASTContext &Context, std::copy(AssocExprs.begin(), AssocExprs.end(), SubExprs+END_EXPR); } -GenericSelectionExpr::GenericSelectionExpr(const ASTContext &Context, - SourceLocation GenericLoc, Expr *ControllingExpr, - ArrayRef AssocTypes, - ArrayRef AssocExprs, - SourceLocation DefaultLoc, - SourceLocation RParenLoc, - bool ContainsUnexpandedParameterPack) - : Expr(GenericSelectionExprClass, - Context.DependentTy, - VK_RValue, - OK_Ordinary, - /*isTypeDependent=*/true, - /*isValueDependent=*/true, - /*isInstantiationDependent=*/true, - ContainsUnexpandedParameterPack), - AssocTypes(new (Context) TypeSourceInfo*[AssocTypes.size()]), - SubExprs(new (Context) Stmt*[END_EXPR+AssocExprs.size()]), - NumAssocs(AssocExprs.size()), ResultIndex(-1U), GenericLoc(GenericLoc), - DefaultLoc(DefaultLoc), RParenLoc(RParenLoc) { +GenericSelectionExpr::GenericSelectionExpr( + const ASTContext &Context, SourceLocation GenericLoc, Expr *ControllingExpr, + ArrayRef AssocTypes, ArrayRef AssocExprs, + SourceLocation DefaultLoc, SourceLocation RParenLoc, + bool ContainsUnexpandedParameterPack) + : Expr(GenericSelectionExprClass, Context.DependentTy, VK_RValue, + OK_Ordinary, + /*isTypeDependent=*/true, + /*isValueDependent=*/true, + /*isInstantiationDependent=*/true, ContainsUnexpandedParameterPack), + AssocTypes(new(Context) TypeSourceInfo *[AssocTypes.size()]), + SubExprs(new(Context) Stmt *[END_EXPR + AssocExprs.size()]), + NumAssocs(AssocExprs.size()), + ResultIndex(std::numeric_limits::max()), GenericLoc(GenericLoc), + DefaultLoc(DefaultLoc), RParenLoc(RParenLoc) { SubExprs[CONTROLLING] = ControllingExpr; assert(AssocTypes.size() == AssocExprs.size()); std::copy(AssocTypes.begin(), AssocTypes.end(), this->AssocTypes); diff --git a/tools/clang/lib/AST/ExprConstant.cpp b/tools/clang/lib/AST/ExprConstant.cpp index 69e0760bce..c24e44022f 100644 --- a/tools/clang/lib/AST/ExprConstant.cpp +++ b/tools/clang/lib/AST/ExprConstant.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// // // This file implements the Expr constant evaluator. @@ -6558,7 +6555,7 @@ bool IntExprEvaluator::VisitCallExpr(const CallExpr *E) { // handle all cases where the expression has side-effects. if (E->getArg(0)->HasSideEffects(Info.Ctx)) { if (E->getArg(1)->EvaluateKnownConstInt(Info.Ctx).getZExtValue() <= 1) - return Success(-1ULL, E); + return Success(~0ULL, E); return Success(0, E); } @@ -6573,7 +6570,7 @@ bool IntExprEvaluator::VisitCallExpr(const CallExpr *E) { return Error(E); case EvalInfo::EM_ConstantExpressionUnevaluated: case EvalInfo::EM_PotentialConstantExpressionUnevaluated: - return Success(-1ULL, E); + return Success(~0ULL, E); } llvm_unreachable("Invalid EvalMode!"); } diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp index 5b19e064a3..00c18a81a9 100644 --- a/tools/clang/lib/AST/HlslTypes.cpp +++ b/tools/clang/lib/AST/HlslTypes.cpp @@ -5,9 +5,6 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // // This file is distributed under the University of Illinois Open Source // // License. See LICENSE.TXT for details. // -// -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. 
// // /// /// \file // @@ -95,6 +92,8 @@ bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type) { } else if (type->isArrayType()) { return IsHLSLNumericOrAggregateOfNumericType( QualType(type->getArrayElementTypeNoTypeQual(), 0)); + } else if (type->isEnumeralType()) { + return true; } // Chars can only appear as part of strings, which we don't consider numeric. @@ -103,31 +102,34 @@ bool IsHLSLNumericOrAggregateOfNumericType(clang::QualType type) { BuiltinTy->getKind() != BuiltinType::Kind::Char_S; } -bool IsHLSLNumericUserDefinedType(clang::QualType type) { - const clang::Type *Ty = type.getCanonicalType().getTypePtr(); +// In some cases we need record types that are annotatable and trivially +// copyable from outside the shader. This excludes resource types which may be +// trivially copyable inside the shader, and builtin matrix and vector types +// which can't be annotated. But includes UDTs of trivially copyable data and +// the builtin trivially copyable raytracing structs. +bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) { + assert(!QT->isIncompleteType() && "Type must be complete!"); + const clang::Type *Ty = QT.getCanonicalType().getTypePtr(); if (const RecordType *RT = dyn_cast(Ty)) { const RecordDecl *RD = RT->getDecl(); - if (!IsUserDefinedRecordType(type)) + if (!IsUserDefinedRecordType(QT)) return false; - for (auto member : RD->fields()) { - if (!IsHLSLNumericOrAggregateOfNumericType(member->getType())) + for (auto Member : RD->fields()) { + if (!IsHLSLNumericOrAggregateOfNumericType(Member->getType())) return false; } + if (auto *CXXRD = dyn_cast(RD)) { + // Walk up the inheritance chain and check base class fields + for (const auto &Base : CXXRD->bases()) { + if (!IsHLSLCopyableAnnotatableRecord(Base.getType())) + return false; + } + } return true; } return false; } -// In some cases we need record types that are annotatable and trivially -// copyable from outside the shader. This excludes resource types which may be -// trivially copyable inside the shader, and builtin matrix and vector types -// which can't be annotated. But includes UDTs of trivially copyable data and -// the builtin trivially copyable raytracing structs. 
-bool IsHLSLCopyableAnnotatableRecord(clang::QualType QT) { - return IsHLSLNumericUserDefinedType(QT) || - IsHLSLBuiltinRayAttributeStruct(QT); -} - bool IsHLSLBuiltinRayAttributeStruct(clang::QualType QT) { QT = QT.getCanonicalType(); const clang::Type *Ty = QT.getTypePtr(); @@ -586,6 +588,12 @@ bool IsHLSLRONodeInputRecordType(clang::QualType type) { static_cast(DXIL::NodeIOFlags::Input); } +bool IsHLSLDispatchNodeInputRecordType(clang::QualType type) { + return IsHLSLNodeInputType(type) && + (static_cast(GetNodeIOType(type)) & + static_cast(DXIL::NodeIOFlags::DispatchRecord)) != 0; +} + bool IsHLSLNodeOutputType(clang::QualType type) { return (static_cast(GetNodeIOType(type)) & (static_cast(DXIL::NodeIOFlags::Output) | @@ -593,6 +601,23 @@ bool IsHLSLNodeOutputType(clang::QualType type) { static_cast(DXIL::NodeIOFlags::Output); } +bool IsHLSLNodeRecordArrayType(clang::QualType type) { + if (const RecordType *RT = type->getAs()) { + StringRef name = RT->getDecl()->getName(); + if (name == "ThreadNodeOutputRecords" || name == "GroupNodeOutputRecords" || + name == "GroupNodeInputRecords" || name == "RWGroupNodeInputRecords" || + name == "EmptyNodeInput") + return true; + } + return false; +} + +bool IsHLSLEmptyNodeRecordType(clang::QualType type) { + return (static_cast(GetNodeIOType(type)) & + static_cast(DXIL::NodeIOFlags::EmptyRecord)) == + static_cast(DXIL::NodeIOFlags::EmptyRecord); +} + bool IsHLSLStructuredBufferType(clang::QualType type) { if (const HLSLResourceAttr *Attr = getAttr(type)) return Attr->getResKind() == DXIL::ResourceKind::StructuredBuffer; @@ -609,7 +634,8 @@ bool IsUserDefinedRecordType(clang::QualType QT) { const clang::Type *Ty = QT.getCanonicalType().getTypePtr(); if (const RecordType *RT = dyn_cast(Ty)) { const RecordDecl *RD = RT->getDecl(); - if (RD->isImplicit()) + // Built-in ray tracing struct types are considered user defined types. 
+ if (RD->isImplicit() && !IsHLSLBuiltinRayAttributeStruct(QT)) return false; if (auto TD = dyn_cast(RD)) if (TD->getSpecializedTemplate()->isImplicit()) @@ -834,6 +860,23 @@ QualType GetHLSLResourceResultType(QualType type) { return HandleFieldDecl->getType(); } +QualType GetHLSLNodeIOResultType(ASTContext &astContext, QualType type) { + if (hlsl::IsHLSLEmptyNodeRecordType(type)) { + RecordDecl *RD = astContext.buildImplicitRecord(""); + RD->startDefinition(); + RD->completeDefinition(); + return astContext.getRecordType(RD); + } else if (hlsl::IsHLSLNodeType(type)) { + const RecordType *recordType = type->getAs(); + if (const auto *templateDecl = + dyn_cast(recordType->getDecl())) { + const auto &templateArgs = templateDecl->getTemplateArgs(); + return templateArgs[0].getAsType(); + } + } + return type; +} + unsigned GetHLSLResourceTemplateUInt(clang::QualType type) { const ClassTemplateSpecializationDecl *templateDecl = cast( diff --git a/tools/clang/lib/AST/MicrosoftMangle.cpp b/tools/clang/lib/AST/MicrosoftMangle.cpp index 40dca1bb1b..ae9f1cd7f8 100644 --- a/tools/clang/lib/AST/MicrosoftMangle.cpp +++ b/tools/clang/lib/AST/MicrosoftMangle.cpp @@ -633,7 +633,7 @@ void MicrosoftCXXNameMangler::mangleNumber(int64_t Number) { uint64_t Value = static_cast(Number); if (Number < 0) { - Value = -Value; + Value = ~Value + 1ULL; Out << '?'; } @@ -2308,7 +2308,7 @@ static void mangleThunkThisAdjustment(const CXXMethodDecl *MD, Out << AccessSpec; Mangler.mangleNumber( static_cast(Adjustment.Virtual.Microsoft.VtordispOffset)); - Mangler.mangleNumber(-static_cast(Adjustment.NonVirtual)); + Mangler.mangleNumber(~static_cast(Adjustment.NonVirtual) + 1); } } else if (Adjustment.NonVirtual != 0) { switch (MD->getAccess()) { @@ -2323,7 +2323,7 @@ static void mangleThunkThisAdjustment(const CXXMethodDecl *MD, case AS_public: Out << 'W'; } - Mangler.mangleNumber(-static_cast(Adjustment.NonVirtual)); + Mangler.mangleNumber(~static_cast(Adjustment.NonVirtual) + 1); } else { switch (MD->getAccess()) { case AS_none: diff --git a/tools/clang/lib/AST/SelectorLocationsKind.cpp b/tools/clang/lib/AST/SelectorLocationsKind.cpp index 671207a7f2..36fd8cea6e 100644 --- a/tools/clang/lib/AST/SelectorLocationsKind.cpp +++ b/tools/clang/lib/AST/SelectorLocationsKind.cpp @@ -28,7 +28,7 @@ static SourceLocation getStandardSelLoc(unsigned Index, if (EndLoc.isInvalid()) return SourceLocation(); IdentifierInfo *II = Sel.getIdentifierInfoForSlot(0); - unsigned Len = II ? II->getLength() : 0; + int Len = II ? II->getLength() : 0; return EndLoc.getLocWithOffset(-Len); } @@ -36,7 +36,7 @@ static SourceLocation getStandardSelLoc(unsigned Index, if (ArgLoc.isInvalid()) return SourceLocation(); IdentifierInfo *II = Sel.getIdentifierInfoForSlot(Index); - unsigned Len = /* selector id */ (II ? II->getLength() : 0) + /* ':' */ 1; + int Len = /* selector id */ (II ? 
II->getLength() : 0) + /* ':' */ 1; if (WithArgSpace) ++Len; return ArgLoc.getLocWithOffset(-Len); diff --git a/tools/clang/lib/CodeGen/CGExprScalar.cpp b/tools/clang/lib/CodeGen/CGExprScalar.cpp index 530c791fcc..50aae94505 100644 --- a/tools/clang/lib/CodeGen/CGExprScalar.cpp +++ b/tools/clang/lib/CodeGen/CGExprScalar.cpp @@ -2559,7 +2559,8 @@ void ScalarExprEmitter::EmitUndefinedBehaviorIntegerDivAndRemCheck( llvm::Value *IntMin = Builder.getInt(llvm::APInt::getSignedMinValue(Ty->getBitWidth())); - llvm::Value *NegOne = llvm::ConstantInt::get(Ty, -1ULL); + llvm::Value *NegOne = + llvm::ConstantInt::get(Ty, std::numeric_limits::max()); llvm::Value *LHSCmp = Builder.CreateICmpNE(Ops.LHS, IntMin); llvm::Value *RHSCmp = Builder.CreateICmpNE(Ops.RHS, NegOne); diff --git a/tools/clang/lib/CodeGen/CGHLSLMS.cpp b/tools/clang/lib/CodeGen/CGHLSLMS.cpp index 16ddeaec60..b5add521a6 100644 --- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp +++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp @@ -288,6 +288,9 @@ class CGMSHLSLRuntime : public CGHLSLRuntime { llvm::Value *DestPtr, clang::QualType DestTy) override; void AddHLSLFunctionInfo(llvm::Function *, const FunctionDecl *FD) override; + bool FindDispatchGridSemantic(const CXXRecordDecl *RD, + hlsl::SVDispatchGrid &SDGRec, + CharUnits Offset = CharUnits()); void AddHLSLNodeRecordTypeInfo(const clang::ParmVarDecl *parmDecl, hlsl::NodeIOProperties &node); void EmitHLSLFunctionProlog(llvm::Function *, @@ -2560,6 +2563,66 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) { m_ScopeMap[F] = ScopeInfo(F, FD->getLocation()); } +// Find the input node record field with the SV_DispatchGrid semantic. +// We have already diagnosed any error conditions in Sema, so we +// expect valid size and types, and use the first occurance found. +// We return true if we have populated the SV_DispatchGrid values. +bool CGMSHLSLRuntime::FindDispatchGridSemantic(const CXXRecordDecl *RD, + hlsl::SVDispatchGrid &SDGRec, + CharUnits Offset) { + const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(RD); + + // Check (non-virtual) bases + for (const CXXBaseSpecifier &Base : RD->bases()) { + DXASSERT(!Base.getType()->isDependentType(), + "Node Record with dependent base class not caught by Sema"); + if (Base.getType()->isDependentType()) + continue; + CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); + CharUnits BaseOffset = Offset + Layout.getBaseClassOffset(BaseDecl); + if (FindDispatchGridSemantic(BaseDecl, SDGRec, BaseOffset)) + return true; + } + + // Check each field in this record. 
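+  // For each field, first recurse into nested record types, then scan the
+  // field's unusual annotations for an SV_DispatchGrid semantic. When found,
+  // record the byte offset, the component count (1 for scalars, the element
+  // count for vectors and arrays), and whether the components are 16- or
+  // 32-bit unsigned values.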
+ for (FieldDecl *Field : RD->fields()) { + uint64_t FieldNo = Field->getFieldIndex(); + CharUnits FieldOffset = Offset + CGM.getContext().toCharUnitsFromBits( + Layout.getFieldOffset(FieldNo)); + + // If this field is a record check its fields + if (const CXXRecordDecl *D = Field->getType()->getAsCXXRecordDecl()) { + if (FindDispatchGridSemantic(D, SDGRec, FieldOffset)) + return true; + } + // Otherwise check this field for the SV_DispatchGrid semantic annotation + for (const hlsl::UnusualAnnotation *UA : Field->getUnusualAnnotations()) { + if (UA->getKind() == hlsl::UnusualAnnotation::UA_SemanticDecl) { + const hlsl::SemanticDecl *SD = cast(UA); + if (SD->SemanticName.equals("SV_DispatchGrid")) { + const llvm::Type *FTy = CGM.getTypes().ConvertType(Field->getType()); + const llvm::Type *ElTy = FTy; + SDGRec.NumComponents = 1; + SDGRec.ByteOffset = (unsigned)FieldOffset.getQuantity(); + if (const llvm::VectorType *VT = dyn_cast(FTy)) { + SDGRec.NumComponents = VT->getNumElements(); + ElTy = VT->getElementType(); + } else if (const llvm::ArrayType *AT = + dyn_cast(FTy)) { + SDGRec.NumComponents = AT->getNumElements(); + ElTy = AT->getElementType(); + } + SDGRec.ComponentType = (ElTy->getIntegerBitWidth() == 16) + ? DXIL::ComponentType::U16 + : DXIL::ComponentType::U32; + return true; + } + } + } + } + return false; +} + void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo( const clang::ParmVarDecl *parmDecl, hlsl::NodeIOProperties &node) { clang::QualType paramTy = parmDecl->getType().getCanonicalType(); @@ -2577,7 +2640,6 @@ void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo( DiagnosticsEngine &Diags = CGM.getDiags(); auto &Rec = TemplateArgs.get(0); clang::QualType RecType = Rec.getAsType(); - llvm::Type *Type = CGM.getTypes().ConvertType(RecType); CXXRecordDecl *RD = RecType->getAsCXXRecordDecl(); // Get the TrackRWInputSharing flag from the record attribute @@ -2597,63 +2659,12 @@ void CGMSHLSLRuntime::AddHLSLNodeRecordTypeInfo( // Ex: For DispatchNodeInputRecord, set size = // size(MY_RECORD), alignment = alignof(MY_RECORD) + llvm::Type *Type = CGM.getTypes().ConvertType(RecType); node.RecordType.size = CGM.getDataLayout().getTypeAllocSize(Type); node.RecordType.alignment = CGM.getDataLayout().getABITypeAlignment(Type); - // Iterate over fields of the MY_RECORD(example) struct - for (auto fieldDecl : RD->fields()) { - // Check if any of the fields have a semantic annotation = - // SV_DispatchGrid - for (const hlsl::UnusualAnnotation *it : - fieldDecl->getUnusualAnnotations()) { - if (it->getKind() == hlsl::UnusualAnnotation::UA_SemanticDecl) { - const hlsl::SemanticDecl *sd = cast(it); - // if we find a field with SV_DispatchGrid, fill out the - // SV_DispatchGrid member with byteoffset of the field, - // NumComponents (3 for uint3 etc) and U32 vs U16 types, which are - // the only types allowed - if (sd->SemanticName.equals("SV_DispatchGrid")) { - clang::QualType FT = fieldDecl->getType(); - auto &DL = CGM.getDataLayout(); - auto &SDGRec = node.RecordType.SV_DispatchGrid; - - DXASSERT_NOMSG(SDGRec.NumComponents == 0); - - unsigned fieldIdx = fieldDecl->getFieldIndex(); - if (StructType *ST = dyn_cast(Type)) { - SDGRec.ByteOffset = - DL.getStructLayout(ST)->getElementOffset(fieldIdx); - } - const llvm::Type *lTy = CGM.getTypes().ConvertType(FT); - if (const llvm::VectorType *VT = - dyn_cast(lTy)) { - DXASSERT(VT->getElementType()->isIntegerTy(), "invalid type"); - SDGRec.NumComponents = VT->getNumElements(); - SDGRec.ComponentType = - (VT->getElementType()->getIntegerBitWidth() == 16) - ? 
DXIL::ComponentType::U16 - : DXIL::ComponentType::U32; - } else if (const llvm::ArrayType *AT = - dyn_cast(lTy)) { - DXASSERT(AT->getElementType()->isIntegerTy(), "invalid type"); - DXASSERT_NOMSG(AT->getNumElements() <= 3); - SDGRec.NumComponents = AT->getNumElements(); - SDGRec.ComponentType = - (AT->getElementType()->getIntegerBitWidth() == 16) - ? DXIL::ComponentType::U16 - : DXIL::ComponentType::U32; - } else { - // Scalar U16 or U32 - DXASSERT(lTy->isIntegerTy(), "invalid type"); - SDGRec.NumComponents = 1; - SDGRec.ComponentType = (lTy->getIntegerBitWidth() == 16) - ? DXIL::ComponentType::U16 - : DXIL::ComponentType::U32; - } - } - } - } - } + + FindDispatchGridSemantic(RD, node.RecordType.SV_DispatchGrid); } } } diff --git a/tools/clang/lib/CodeGen/CoverageMappingGen.cpp b/tools/clang/lib/CodeGen/CoverageMappingGen.cpp index eca91590e6..e16e015a74 100644 --- a/tools/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/tools/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -116,7 +116,7 @@ class CoverageMappingBuilder { /// \brief Return the start location of an included file or expanded macro. SourceLocation getStartOfFileOrMacro(SourceLocation Loc) { if (Loc.isMacroID()) - return Loc.getLocWithOffset(-SM.getFileOffset(Loc)); + return Loc.getLocWithOffset(~SM.getFileOffset(Loc) + 1); return SM.getLocForStartOfFile(SM.getFileID(Loc)); } diff --git a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp index 97fe28be7f..f39ec6d497 100644 --- a/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -639,8 +639,8 @@ llvm::Constant * ItaniumCXXABI::EmitNullMemberPointer(const MemberPointerType *MPT) { // Itanium C++ ABI 2.3: // A NULL pointer is represented as -1. - if (MPT->isMemberDataPointer()) - return llvm::ConstantInt::get(CGM.PtrDiffTy, -1ULL, /*isSigned=*/true); + if (MPT->isMemberDataPointer()) + return llvm::ConstantInt::get(CGM.PtrDiffTy, -1LL, /*isSigned=*/true); llvm::Constant *Zero = llvm::ConstantInt::get(CGM.PtrDiffTy, 0); llvm::Constant *Values[2] = { Zero, Zero }; @@ -1023,7 +1023,7 @@ static CharUnits computeOffsetHint(ASTContext &Context, // If Dst is not derived from Src we can skip the whole computation below and // return that Src is not a public base of Dst. Record all inheritance paths. if (!Dst->isDerivedFrom(Src, Paths)) - return CharUnits::fromQuantity(-2ULL); + return CharUnits::fromQuantity(-2LL); unsigned NumPublicPaths = 0; CharUnits Offset; @@ -1040,7 +1040,7 @@ static CharUnits computeOffsetHint(ASTContext &Context, // If the path contains a virtual base class we can't give any hint. // -1: no hint. if (J->Base->isVirtual()) - return CharUnits::fromQuantity(-1ULL); + return CharUnits::fromQuantity(-1LL); if (NumPublicPaths > 1) // Won't use offsets, skip computation. continue; @@ -1053,11 +1053,11 @@ static CharUnits computeOffsetHint(ASTContext &Context, // -2: Src is not a public base of Dst. if (NumPublicPaths == 0) - return CharUnits::fromQuantity(-2ULL); + return CharUnits::fromQuantity(-2LL); // -3: Src is a multiple public base type but never a virtual base type. if (NumPublicPaths > 1) - return CharUnits::fromQuantity(-3ULL); + return CharUnits::fromQuantity(-3LL); // Otherwise, the Src type is a unique public nonvirtual base type of Dst. // Return the offset of Src from the origin of Dst. @@ -1090,7 +1090,7 @@ llvm::Value *ItaniumCXXABI::EmitTypeid(CodeGenFunction &CGF, CGF.GetVTablePtr(ThisPtr, StdTypeInfoPtrTy->getPointerTo()); // Load the type info. 
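+  // Per the Itanium C++ ABI, the std::type_info pointer is stored one slot
+  // before the vtable's address point, hence the -1 index below.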
- Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1ULL); + Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1LL); return CGF.Builder.CreateLoad(Value); } @@ -1154,7 +1154,7 @@ llvm::Value *ItaniumCXXABI::EmitDynamicCastToVoid(CodeGenFunction &CGF, // Get the offset-to-top from the vtable. llvm::Value *OffsetToTop = - CGF.Builder.CreateConstInBoundsGEP1_64(VTable, -2ULL); + CGF.Builder.CreateConstInBoundsGEP1_64(VTable, -2LL); OffsetToTop = CGF.Builder.CreateLoad(OffsetToTop, "offset.to.top"); // Finally, add the offset to the pointer. diff --git a/tools/clang/lib/CodeGen/TargetInfo.cpp b/tools/clang/lib/CodeGen/TargetInfo.cpp index aba43964d9..aaf63355af 100644 --- a/tools/clang/lib/CodeGen/TargetInfo.cpp +++ b/tools/clang/lib/CodeGen/TargetInfo.cpp @@ -1283,7 +1283,7 @@ llvm::Value *X86_32ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty, Addr = CGF.Builder.CreateGEP(Addr, Offset); llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(Addr, CGF.Int32Ty); - llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int32Ty, -Align); + llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int32Ty, ~Align + 1); Addr = CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask), Addr->getType(), "ap.cur.aligned"); @@ -2849,7 +2849,7 @@ static llvm::Value *EmitVAArgFromMemory(llvm::Value *VAListAddr, overflow_arg_area = CGF.Builder.CreateGEP(overflow_arg_area, Offset); llvm::Value *AsInt = CGF.Builder.CreatePtrToInt(overflow_arg_area, CGF.Int64Ty); - llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, -(uint64_t)Align); + llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, ~Align + 1); overflow_arg_area = CGF.Builder.CreateIntToPtr(CGF.Builder.CreateAnd(AsInt, Mask), overflow_arg_area->getType(), diff --git a/tools/clang/lib/Format/Format.cpp b/tools/clang/lib/Format/Format.cpp index 7d556c9f0f..b6ca328972 100644 --- a/tools/clang/lib/Format/Format.cpp +++ b/tools/clang/lib/Format/Format.cpp @@ -1049,7 +1049,7 @@ class FormatTokenLexer { FormatTok = new (Allocator.Allocate()) FormatToken; readRawToken(*FormatTok); SourceLocation WhitespaceStart = - FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); + FormatTok->Tok.getLocation().getLocWithOffset(~TrailingWhitespace + 1); FormatTok->IsFirst = IsFirstToken; IsFirstToken = false; diff --git a/tools/clang/lib/Format/FormatToken.h b/tools/clang/lib/Format/FormatToken.h index f335eda086..249d526871 100644 --- a/tools/clang/lib/Format/FormatToken.h +++ b/tools/clang/lib/Format/FormatToken.h @@ -86,11 +86,11 @@ namespace format { TYPE(UnaryOperator) \ TYPE(Unknown) -enum TokenType { +enum TokenType : unsigned int { #define TYPE(X) TT_##X, -LIST_TOKEN_TYPES + LIST_TOKEN_TYPES #undef TYPE - NUM_TOKEN_TYPES + NUM_TOKEN_TYPES }; /// \brief Determines the name of a token type. diff --git a/tools/clang/lib/Headers/hlsl/dx/linalg.h b/tools/clang/lib/Headers/hlsl/dx/linalg.h new file mode 100644 index 0000000000..4f5e62070d --- /dev/null +++ b/tools/clang/lib/Headers/hlsl/dx/linalg.h @@ -0,0 +1,198 @@ +// Header for linear algebra APIs. 
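+//
+// Provides thin wrappers over the SM 6.9 cooperative-vector builtins in the
+// dx::linalg namespace: MatrixRef/VectorRef descriptor structs, Mul and
+// MulAdd matrix-vector helpers, and the OuterProductAccumulate and
+// VectorAccumulate accumulation helpers. The whole header is gated on
+// shader model 6.9+ and HLSL 2021+.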
+ +#if __spirv__ +#error "Cooperative vectors not (yet) supported for SPIRV" +#endif + +#if ((__SHADER_TARGET_MAJOR > 6) || \ + (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 9)) && \ + (__HLSL_VERSION >= 2021) + +namespace dx { +namespace linalg { + +// NOTE: can't be an enum class because we get this error: +// error: non-type template argument of type 'dx::linalg::DataType' is not +// an integral constant expression +// +enum DataType { + DATA_TYPE_SINT16 = 2, // ComponentType::I16 + DATA_TYPE_UINT16 = 3, // ComponentType::U16 + DATA_TYPE_SINT32 = 4, // ComponentType::I32 + DATA_TYPE_UINT32 = 5, // ComponentType::U32 + DATA_TYPE_FLOAT16 = 8, // ComponentType::F16 + DATA_TYPE_FLOAT32 = 9, // ComponentType::F32 + DATA_TYPE_SINT8_T4_PACKED = 17, // ComponentType::PackedS8x32 + DATA_TYPE_UINT8_T4_PACKED = 18, // ComponentType::PackedU8x32 + DATA_TYPE_UINT8 = 19, // ComponentType::U8 + DATA_TYPE_SINT8 = 20, // ComponentType::I8 + DATA_TYPE_FLOAT8_E4M3 = 21, // ComponentType::F8_E4M3 + // (1 sign, 4 exp, 3 mantissa bits) + DATA_TYPE_FLOAT8_E5M2 = 22, // ComponentType::F8_E5M2 + // (1 sign, 5 exp, 2 mantissa bits) +}; + +enum MatrixLayout { + MATRIX_LAYOUT_ROW_MAJOR = 0, + MATRIX_LAYOUT_COLUMN_MAJOR = 1, + MATRIX_LAYOUT_MUL_OPTIMAL = 2, + MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL = 3 +}; + +// +// Helper for signedness +// +namespace details { + +template struct IsUnsigned {}; + +#define _SPECIALIZE_ISUNSIGNED(type, value) \ + template <> struct IsUnsigned { \ + static const bool Value = value; \ + } + +_SPECIALIZE_ISUNSIGNED(uint8_t4_packed, true); +_SPECIALIZE_ISUNSIGNED(int8_t4_packed, true); +_SPECIALIZE_ISUNSIGNED(uint32_t, true); +_SPECIALIZE_ISUNSIGNED(int32_t, false); +_SPECIALIZE_ISUNSIGNED(float32_t, false); + +#ifdef __HLSL_ENABLE_16_BIT +_SPECIALIZE_ISUNSIGNED(uint16_t, true); +_SPECIALIZE_ISUNSIGNED(int16_t, false); +_SPECIALIZE_ISUNSIGNED(float16_t, false); +#else // //__HLSL_ENABLE_16_BIT +_SPECIALIZE_ISUNSIGNED(half, false); +#endif //__HLSL_ENABLE_16_BIT + +#undef _SPECIALIZE_ISUNSIGNED + +} // namespace details + +// +// (RW)MatrixRef +// + +template +struct MatrixRefImpl { + BufferTy Buffer; + uint StartOffset; + uint Stride; +}; + +template +using MatrixRef = MatrixRefImpl; + +template +using RWMatrixRef = MatrixRefImpl; + +// +// (RW)VectorRef +// + +template struct VectorRefImpl { + BufferTy Buffer; + uint StartOffset; +}; + +template using VectorRef = VectorRefImpl; + +template +using RWVectorRef = VectorRefImpl; + +// +// Vector +// + +template struct InterpretedVector { + vector Data; +}; + +template +InterpretedVector MakeInterpretedVector(vector Vec) { + InterpretedVector IV = {Vec}; + return IV; +} + +// +// Mul +// + +template +vector +Mul(MatrixRefImpl + Matrix, + InterpretedVector InputVector) { + + vector OutputVector; + + __builtin_MatVecMul( + /*out*/ OutputVector, details::IsUnsigned::Value, + InputVector.Data, details::IsUnsigned::Value, InputDT, + Matrix.Buffer, Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, + MatrixLayout, MatrixTranspose, Matrix.Stride); + + return OutputVector; +} + +// +// MulAdd +// + +template +vector +MulAdd(MatrixRefImpl + Matrix, + InterpretedVector InputVector, + VectorRefImpl BiasVector) { + + vector OutputVector; + + __builtin_MatVecMulAdd( + /*out*/ OutputVector, details::IsUnsigned::Value, + InputVector.Data, details::IsUnsigned::Value, InputDT, + Matrix.Buffer, Matrix.StartOffset, MatrixDT, MatrixM, MatrixK, + MatrixLayout, MatrixTranspose, Matrix.Stride, BiasVector.Buffer, + BiasVector.StartOffset, BiasVectorDT); + + 
return OutputVector; +} + +// +// OuterProductAccumulate +// + +template +void OuterProductAccumulate( + vector InputVector1, vector InputVector2, + RWMatrixRef Matrix) { + __builtin_OuterProductAccumulate(InputVector1, InputVector2, Matrix.Buffer, + Matrix.StartOffset, MatrixDT, MatrixLayout, + Matrix.Stride); +} + +// +// VectorAccumulate +// + +template +void VectorAccumulate(vector InputVector, + RWByteAddressBuffer Buffer, uint Offset) { + __builtin_VectorAccumulate(InputVector, Buffer, Offset); +} + +} // namespace linalg +} // namespace dx + +#endif // SM 6.9 check and HV version check diff --git a/tools/clang/lib/Lex/Lexer.cpp b/tools/clang/lib/Lex/Lexer.cpp index 089e76b78b..ce9dd8a3c0 100644 --- a/tools/clang/lib/Lex/Lexer.cpp +++ b/tools/clang/lib/Lex/Lexer.cpp @@ -480,7 +480,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, } // Create a lexer starting at the beginning of this token. - SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); + SourceLocation LexerStartLoc = Loc.getLocWithOffset(~LocInfo.second + 1); Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); TheLexer.SetCommentRetentionState(true); @@ -2737,7 +2737,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, char C = getCharAndSize(CurPtr, CharSize); unsigned Value = llvm::hexDigitValue(C); - if (Value == -1U) { + if (Value == std::numeric_limits::max()) { if (Result && !isLexingRawMode()) { if (i == 0) { Diag(BufferPtr, diag::warn_ucn_escape_no_digits) diff --git a/tools/clang/lib/Lex/LiteralSupport.cpp b/tools/clang/lib/Lex/LiteralSupport.cpp index 606c821bb2..62f241812b 100644 --- a/tools/clang/lib/Lex/LiteralSupport.cpp +++ b/tools/clang/lib/Lex/LiteralSupport.cpp @@ -141,8 +141,12 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, // Hex escapes are a maximal series of hex digits. bool Overflow = false; for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { - int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); - if (CharVal == -1) break; + // originally returned -1 for invalid hex digits, now returns ~0u + // signature: static inline unsigned int llvm::hexDigitValue(char C) + unsigned int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); + if (CharVal == ~0U) + break; + // About to shift out a digit? if (ResultChar & 0xF0000000) Overflow = true; @@ -245,7 +249,7 @@ void clang::expandUCNs(SmallVectorImpl &Buf, StringRef Input) { uint32_t CodePoint = 0; for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { unsigned Value = llvm::hexDigitValue(*I); - assert(Value != -1U); + assert(Value != ~0U); CodePoint <<= 4; CodePoint += Value; @@ -278,8 +282,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); unsigned short UcnLenSave = UcnLen; for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { - int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); - if (CharVal == -1) break; + unsigned int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); + if (CharVal == ~0U) + break; UcnVal <<= 4; UcnVal |= CharVal; } diff --git a/tools/clang/lib/Lex/PPMacroExpansion.cpp b/tools/clang/lib/Lex/PPMacroExpansion.cpp index ebfb93df2e..16040d69c7 100644 --- a/tools/clang/lib/Lex/PPMacroExpansion.cpp +++ b/tools/clang/lib/Lex/PPMacroExpansion.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. 
-// //===----------------------------------------------------------------------===// // // This file implements the top level handling of macro expansion for the diff --git a/tools/clang/lib/Rewrite/Rewriter.cpp b/tools/clang/lib/Rewrite/Rewriter.cpp index be09a363a6..fa081d65ac 100644 --- a/tools/clang/lib/Rewrite/Rewriter.cpp +++ b/tools/clang/lib/Rewrite/Rewriter.cpp @@ -60,7 +60,7 @@ void RewriteBuffer::RemoveText(unsigned OrigOffset, unsigned Size, Buffer.erase(RealOffset, Size); // Add a delta so that future changes are offset correctly. - AddReplaceDelta(OrigOffset, -Size); + AddReplaceDelta(OrigOffset, ~Size + 1); if (removeLineIfEmpty) { // Find the line that the remove occurred and if it is completely empty @@ -86,7 +86,7 @@ void RewriteBuffer::RemoveText(unsigned OrigOffset, unsigned Size, } if (posI != end() && *posI == '\n') { Buffer.erase(curLineStartOffs, lineSize + 1/* + '\n'*/); - AddReplaceDelta(curLineStartOffs, -(lineSize + 1/* + '\n'*/)); + AddReplaceDelta(curLineStartOffs, ~(lineSize + 1 /* + '\n'*/) + 1); } } } diff --git a/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp b/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp index db140f4766..9bb2f1b1fa 100644 --- a/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp +++ b/tools/clang/lib/SPIRV/AlignmentSizeCalculator.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #include "AlignmentSizeCalculator.h" diff --git a/tools/clang/lib/SPIRV/AstTypeProbe.cpp b/tools/clang/lib/SPIRV/AstTypeProbe.cpp index 31a9bd8f7d..b6ca1f60ae 100644 --- a/tools/clang/lib/SPIRV/AstTypeProbe.cpp +++ b/tools/clang/lib/SPIRV/AstTypeProbe.cpp @@ -1353,6 +1353,27 @@ bool isOrContainsNonFpColMajorMatrix(const ASTContext &astContext, return false; } +bool isOrContainsBoolType(QualType type) { + if (isBoolOrVecMatOfBoolType(type)) { + return true; + } + + if (const auto *arrayType = type->getAsArrayTypeUnsafe()) { + return isOrContainsBoolType(arrayType->getElementType()); + } + + if (const auto *recordType = type->getAs()) { + for (auto field : recordType->getDecl()->fields()) { + if (isOrContainsBoolType(field->getType())) { + return true; + } + } + return false; + } + + return false; +} + bool isTypeInVkNamespace(const RecordType *type) { if (const auto *nameSpaceDecl = dyn_cast(type->getDecl()->getDeclContext())) { diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp index 24dfdc2e9a..c8444a3b81 100644 --- a/tools/clang/lib/SPIRV/CapabilityVisitor.cpp +++ b/tools/clang/lib/SPIRV/CapabilityVisitor.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #include "CapabilityVisitor.h" @@ -125,6 +122,12 @@ void CapabilityVisitor::addCapabilityForType(const SpirvType *type, } addCapabilityForType(raType->getElementType(), loc, sc); } + // Node payload array also requires additional capability. 
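+  // As with runtime arrays, the element type is visited recursively; node
+  // payload arrays additionally require the AMD_shader_enqueue extension and
+  // the ShaderEnqueueAMDX capability.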
+ else if (const auto *npaType = dyn_cast(type)) { + addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc); + addCapability(spv::Capability::ShaderEnqueueAMDX, loc); + addCapabilityForType(npaType->getElementType(), loc, sc); + } // Image types else if (const auto *imageType = dyn_cast(type)) { switch (imageType->getDimension()) { @@ -257,6 +260,19 @@ bool CapabilityVisitor::visit(SpirvDecoration *decor) { addCapability(spv::Capability::FragmentBarycentricKHR); break; } + case spv::Decoration::NodeSharesPayloadLimitsWithAMDX: + case spv::Decoration::NodeMaxPayloadsAMDX: + case spv::Decoration::TrackFinishWritingAMDX: + case spv::Decoration::PayloadNodeNameAMDX: + case spv::Decoration::PayloadNodeBaseIndexAMDX: + case spv::Decoration::PayloadNodeSparseArrayAMDX: + case spv::Decoration::PayloadNodeArraySizeAMDX: + case spv::Decoration::PayloadDispatchIndirectAMDX: { + featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc); + addCapability(spv::Capability::ShaderEnqueueAMDX, loc); + addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc); + break; + } // Capabilities needed for built-ins case spv::Decoration::BuiltIn: { AddVulkanMemoryModelForVolatile(decor, loc); @@ -535,8 +551,14 @@ bool CapabilityVisitor::visitInstruction(SpirvInstruction *instr) { addCapability(spv::Capability::GroupNonUniformQuad); break; case spv::Op::OpVariable: { - if (spvOptions.enableReflect && - !cast(instr)->getHlslUserType().empty()) { + auto var = cast(instr); + auto storage = var->getStorageClass(); + if (storage == spv::StorageClass::NodePayloadAMDX) { + featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc); + addCapability(spv::Capability::ShaderEnqueueAMDX, loc); + addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc); + } + if (spvOptions.enableReflect && !var->getHlslUserType().empty()) { addExtension(Extension::GOOGLE_user_type, "HLSL User Type", loc); addExtension(Extension::GOOGLE_hlsl_functionality1, "HLSL User Type", loc); @@ -580,6 +602,28 @@ bool CapabilityVisitor::visitInstruction(SpirvInstruction *instr) { } break; } + case spv::Op::OpConstantStringAMDX: + case spv::Op::OpSpecConstantStringAMDX: + case spv::Op::OpAllocateNodePayloadsAMDX: + case spv::Op::OpEnqueueNodePayloadsAMDX: + case spv::Op::OpIsNodePayloadValidAMDX: + case spv::Op::OpFinishWritingNodePayloadAMDX: { + featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", loc); + addCapability(spv::Capability::ShaderEnqueueAMDX, loc); + addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", loc); + break; + } + case spv::Op::OpControlBarrier: + case spv::Op::OpMemoryBarrier: { + auto barrier = cast(instr); + if ((bool)(barrier->getMemorySemantics() & + spv::MemorySemanticsMask::OutputMemoryKHR)) { + featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "NODE_OUTPUT_MEMORY", + loc); + addCapability(spv::Capability::VulkanMemoryModel, loc); + } + break; + } default: break; @@ -639,12 +683,25 @@ bool CapabilityVisitor::visit(SpirvEntryPoint *entryPoint) { return true; } -bool CapabilityVisitor::visit(SpirvExecutionMode *execMode) { +bool CapabilityVisitor::visit(SpirvExecutionModeBase *execMode) { spv::ExecutionMode executionMode = execMode->getExecutionMode(); SourceLocation execModeSourceLocation = execMode->getSourceLocation(); SourceLocation entryPointSourceLocation = execMode->getEntryPoint()->getSourceLocation(); switch (executionMode) { + case spv::ExecutionMode::CoalescingAMDX: + case spv::ExecutionMode::MaxNodeRecursionAMDX: + case 
spv::ExecutionMode::StaticNumWorkgroupsAMDX: + case spv::ExecutionMode::MaxNumWorkgroupsAMDX: + featureManager.requestTargetEnv(SPV_ENV_VULKAN_1_3, "WorkGraphs", + execModeSourceLocation); + addCapability(spv::Capability::ShaderEnqueueAMDX, execModeSourceLocation); + addExtension(Extension::AMD_shader_enqueue, "Vulkan 1.3", + execModeSourceLocation); + break; + case spv::ExecutionMode::SubgroupSize: + addCapability(spv::Capability::SubgroupDispatch, execModeSourceLocation); + break; case spv::ExecutionMode::PostDepthCoverage: addCapability(spv::Capability::SampleMaskPostDepthCoverage, entryPointSourceLocation); diff --git a/tools/clang/lib/SPIRV/CapabilityVisitor.h b/tools/clang/lib/SPIRV/CapabilityVisitor.h index 95db110cce..35d4b5a18b 100644 --- a/tools/clang/lib/SPIRV/CapabilityVisitor.h +++ b/tools/clang/lib/SPIRV/CapabilityVisitor.h @@ -31,7 +31,7 @@ class CapabilityVisitor : public Visitor { bool visit(SpirvDecoration *decor) override; bool visit(SpirvEntryPoint *) override; - bool visit(SpirvExecutionMode *) override; + bool visit(SpirvExecutionModeBase *execMode) override; bool visit(SpirvImageQuery *) override; bool visit(SpirvImageOp *) override; bool visit(SpirvImageSparseTexelsResident *) override; diff --git a/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp b/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp index 058e7b6255..24fab092cc 100644 --- a/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp +++ b/tools/clang/lib/SPIRV/DebugTypeVisitor.cpp @@ -356,6 +356,17 @@ SpirvDebugType *DebugTypeVisitor::lowerToDebugType(const SpirvType *spirvType) { debugType = spvContext.getDebugTypeArray(spirvType, elemDebugType, counts); break; } + case SpirvType::TK_NodePayloadArrayAMD: { + auto *arrType = dyn_cast(spirvType); + SpirvDebugInstruction *elemDebugType = + lowerToDebugType(arrType->getElementType()); + + llvm::SmallVector counts; + counts.push_back(0u); + + debugType = spvContext.getDebugTypeArray(spirvType, elemDebugType, counts); + break; + } case SpirvType::TK_Vector: { auto *vecType = dyn_cast(spirvType); SpirvDebugInstruction *elemDebugType = diff --git a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp index de73d5e417..9d0d8f51a3 100644 --- a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp +++ b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp @@ -467,6 +467,10 @@ hlsl::DxilParamInputQual deduceParamQual(const DeclaratorDecl *decl, if (decl->hasAttr()) return hlsl::DxilParamInputQual::InPayload; + if (hlsl::IsHLSLNodeType(type)) { + return hlsl::DxilParamInputQual::NodeIO; + } + return asInput ? 
hlsl::DxilParamInputQual::In : hlsl::DxilParamInputQual::Out; } @@ -475,6 +479,9 @@ hlsl::DxilParamInputQual deduceParamQual(const DeclaratorDecl *decl, const hlsl::SigPoint *deduceSigPoint(const DeclaratorDecl *decl, bool asInput, const hlsl::ShaderModel::Kind kind, bool forPCF) { + if (kind == hlsl::ShaderModel::Kind::Node) { + return hlsl::SigPoint::GetSigPoint(hlsl::SigPoint::Kind::CSIn); + } return hlsl::SigPoint::GetSigPoint(hlsl::SigPointFromInputQual( deduceParamQual(decl, asInput), kind, forPCF)); } @@ -2158,6 +2165,8 @@ bool DeclResultIdMapper::assignLocations( llvm::DenseSet *stageVariableLocationInfo) { for (const auto *var : vars) { + if (hlsl::IsHLSLNodeType(var->getAstType())) + continue; auto locCount = var->getLocationCount(); uint32_t location = nextLocs(locCount); spvBuilder.decorateLocation(var->getSpirvInstr(), location); @@ -3489,7 +3498,9 @@ SpirvVariable *DeclResultIdMapper::createSpirvInterfaceVariable( StageVar stageVar( stageVarData.sigPoint, *stageVarData.semantic, builtinAttr, evalType, // For HS/DS/GS, we have already stripped the outmost arrayness on type. - getLocationAndComponentCount(astContext, stageVarData.type)); + hlsl::IsHLSLNodeInputType(stageVarData.type) + ? LocationAndComponent({0, 0, false}) + : getLocationAndComponentCount(astContext, stageVarData.type)); const auto name = stageVarData.namePrefix.str() + "." + stageVar.getSemanticStr(); SpirvVariable *varInstr = createSpirvStageVar( @@ -3708,6 +3719,22 @@ bool DeclResultIdMapper::createStageVars(StageVarDataBundle &stageVarData, stageVarData.semantic = &thisSemantic; } + if (hlsl::IsHLSLNodeType(stageVarData.type)) { + // Hijack the notion of semantic to use createSpirvInterfaceVariable + StringRef str = stageVarData.decl->getName(); + stageVarData.semantic->str = stageVarData.semantic->name = str; + stageVarData.semantic->semantic = hlsl::Semantic::GetArbitrary(); + SpirvVariable *varInstr = createSpirvInterfaceVariable(stageVarData); + if (!varInstr) { + return false; + } + + *value = hlsl::IsHLSLNodeInputType(stageVarData.type) + ? varInstr + : loadShaderInputVariable(varInstr, stageVarData); + return true; + } + if (stageVarData.semantic->isValid() && // Structs with attached semantics will be handled later. !stageVarData.type->isStructureType()) { @@ -4161,6 +4188,8 @@ SpirvVariable *DeclResultIdMapper::getBuiltinVar(spv::BuiltIn builtIn, case spv::BuiltIn::GlobalInvocationId: case spv::BuiltIn::WorkgroupId: case spv::BuiltIn::LocalInvocationIndex: + case spv::BuiltIn::RemainingRecursionLevelsAMDX: + case spv::BuiltIn::ShaderIndexAMDX: sc = spv::StorageClass::Input; break; case spv::BuiltIn::TaskCountNV: @@ -4196,7 +4225,9 @@ SpirvVariable *DeclResultIdMapper::createSpirvStageVar( const auto type = stageVar->getAstType(); const auto isPrecise = decl->hasAttr(); auto isNointerp = decl->hasAttr(); - spv::StorageClass sc = getStorageClassForSigPoint(sigPoint); + spv::StorageClass sc = hlsl::IsHLSLNodeInputType(stageVar->getAstType()) + ? spv::StorageClass::NodePayloadAMDX + : getStorageClassForSigPoint(sigPoint); if (sc == spv::StorageClass::Max) return 0; stageVar->setStorageClass(sc); diff --git a/tools/clang/lib/SPIRV/EmitVisitor.cpp b/tools/clang/lib/SPIRV/EmitVisitor.cpp index eb00f59632..eb94ce0797 100644 --- a/tools/clang/lib/SPIRV/EmitVisitor.cpp +++ b/tools/clang/lib/SPIRV/EmitVisitor.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. 
-// All rights reserved. -// //===----------------------------------------------------------------------===// // Do not change the inclusion order between "dxc/Support/*" files. @@ -617,19 +614,20 @@ bool EmitVisitor::visit(SpirvEntryPoint *inst) { return true; } -bool EmitVisitor::visit(SpirvExecutionMode *inst) { +bool EmitVisitor::visit(SpirvExecutionModeBase *inst) { initInstruction(inst); curInst.push_back(getOrAssignResultId(inst->getEntryPoint())); curInst.push_back(static_cast(inst->getExecutionMode())); - if (inst->getopcode() == spv::Op::OpExecutionMode) { - curInst.insert(curInst.end(), inst->getParams().begin(), - inst->getParams().end()); - } else { - for (uint32_t param : inst->getParams()) { - curInst.push_back(typeHandler.getOrCreateConstantInt( - llvm::APInt(32, param), context.getUIntType(32), - /*isSpecConst */ false)); + if (auto *exeModeId = dyn_cast(inst)) { + for (SpirvInstruction *param : exeModeId->getParams()) { + if (auto *ConstantInst = dyn_cast(param)) + typeHandler.getOrCreateConstant(ConstantInst); + curInst.push_back(getOrAssignResultId(param)); } + } else { + auto *exeMode = llvm::cast(inst); + ArrayRef params = exeMode->getParams(); + curInst.insert(curInst.end(), params.begin(), params.end()); } finalizeInstruction(&preambleBinary); return true; @@ -940,6 +938,73 @@ bool EmitVisitor::visit(SpirvBarrier *inst) { curInst.push_back(memoryScopeId); curInst.push_back(memorySemanticsId); finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + +bool EmitVisitor::visit(SpirvIsNodePayloadValid *inst) { + initInstruction(inst); + curInst.push_back(inst->getResultTypeId()); + curInst.push_back(getOrAssignResultId(inst)); + curInst.push_back( + getOrAssignResultId(inst->getPayloadArray())); + curInst.push_back( + getOrAssignResultId(inst->getNodeIndex())); + finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + +bool EmitVisitor::visit(SpirvNodePayloadArrayLength *inst) { + initInstruction(inst); + curInst.push_back(inst->getResultTypeId()); + curInst.push_back(getOrAssignResultId(inst)); + curInst.push_back( + getOrAssignResultId(inst->getPayloadArray())); + finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + +bool EmitVisitor::visit(SpirvAllocateNodePayloads *inst) { + const uint32_t allocationScopeId = typeHandler.getOrCreateConstantInt( + llvm::APInt(32, static_cast(inst->getAllocationScope())), + context.getUIntType(32), /*isSpecConst */ false); + + initInstruction(inst); + curInst.push_back(inst->getResultTypeId()); + curInst.push_back(getOrAssignResultId(inst)); + curInst.push_back(allocationScopeId); + curInst.push_back( + getOrAssignResultId(inst->getRecordCount())); + curInst.push_back( + getOrAssignResultId(inst->getShaderIndex())); + finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + +bool EmitVisitor::visit(SpirvEnqueueNodePayloads *inst) { + initInstruction(inst); + curInst.push_back(getOrAssignResultId(inst->getPayload())); + finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + +bool EmitVisitor::visit(SpirvFinishWritingNodePayload *inst) { + initInstruction(inst); + curInst.push_back(inst->getResultTypeId()); + 
curInst.push_back(getOrAssignResultId(inst)); + curInst.push_back(getOrAssignResultId(inst->getPayload())); + finalizeInstruction(&mainBinary); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); return true; } @@ -1013,6 +1078,13 @@ bool EmitVisitor::visit(SpirvConstantComposite *inst) { return true; } +bool EmitVisitor::visit(SpirvConstantString *inst) { + typeHandler.getOrCreateConstant(inst); + emitDebugNameForInstruction(getOrAssignResultId(inst), + inst->getDebugName()); + return true; +} + bool EmitVisitor::visit(SpirvConstantNull *inst) { typeHandler.getOrCreateConstant(inst); emitDebugNameForInstruction(getOrAssignResultId(inst), @@ -1563,12 +1635,6 @@ bool EmitVisitor::visit(SpirvDebugLexicalBlock *inst) { } bool EmitVisitor::visit(SpirvDebugScope *inst) { - // Technically entry function wrappers do not exist in HLSL. They - // are just created by DXC. We do not want to emit DebugScope for - // it. - if (inEntryFunctionWrapper) - return true; - initInstruction(inst); curInst.push_back(inst->getResultTypeId()); curInst.push_back(getOrAssignResultId(inst)); @@ -1999,7 +2065,13 @@ bool EmitVisitor::visit(SpirvIntrinsicInstruction *inst) { } } - finalizeInstruction(&mainBinary); + auto opcode = static_cast(inst->getInstruction()); + if ((opcode == spv::Op::OpSpecConstant || opcode == spv::Op::OpConstant) && + !inst->getInstructionSet()) { + finalizeInstruction(&typeConstantBinary); + } else { + finalizeInstruction(&mainBinary); + } return true; } @@ -2074,6 +2146,8 @@ uint32_t EmitTypeHandler::getOrCreateConstant(SpirvConstant *inst) { return getOrCreateConstantNull(constNull); } else if (auto *constBool = dyn_cast(inst)) { return getOrCreateConstantBool(constBool); + } else if (auto *constString = dyn_cast(inst)) { + return getOrCreateConstantString(constString); } else if (auto *constUndef = dyn_cast(inst)) { return getOrCreateUndef(constUndef); } @@ -2112,6 +2186,36 @@ uint32_t EmitTypeHandler::getOrCreateConstantBool(SpirvConstantBoolean *inst) { return inst->getResultId(); } +uint32_t EmitTypeHandler::getOrCreateConstantString(SpirvConstantString *inst) { + const StringRef str = inst->getString(); + const bool isSpecConst = inst->isSpecConstant(); + + if (!isSpecConst && + emittedConstantStrings.find(str) != emittedConstantStrings.end()) { + // Already emitted this constant value. Reuse. + inst->setResultId(emittedConstantStrings[str]->getResultId()); + } else if (isSpecConst && emittedSpecConstantInstructions.find(inst) != + emittedSpecConstantInstructions.end()) { + // We've already emitted this SpecConstant. Reuse. + return inst->getResultId(); + } else { + // Constant wasn't emitted in the past. 
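+    // Encode the string payload as SPIR-V words and emit the instruction
+    // into the types-and-constants section of the module.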
+ const auto &words = string::encodeSPIRVString(inst->getString()); + initTypeInstruction(inst->getopcode()); + curTypeInst.push_back(getOrAssignResultId(inst)); + curTypeInst.insert(curTypeInst.end(), words.begin(), words.end()); + finalizeTypeInstruction(); + // Remember this constant for the future (if not a spec constant) + if (isSpecConst) { + emittedSpecConstantInstructions.insert(inst); + } else { + emittedConstantStrings[str] = inst; + } + } + + return inst->getResultId(); +} + uint32_t EmitTypeHandler::getOrCreateConstantNull(SpirvConstantNull *inst) { auto found = std::find_if(emittedConstantNulls.begin(), emittedConstantNulls.end(), @@ -2532,6 +2636,84 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) { if (stride.hasValue()) emitDecoration(id, spv::Decoration::ArrayStride, {stride.getValue()}); } + // NodePayloadArray types + else if (const auto *npaType = dyn_cast(type)) { + const uint32_t elemTypeId = emitType(npaType->getElementType()); + initTypeInstruction(spv::Op::OpTypeNodePayloadArrayAMDX); + curTypeInst.push_back(id); + curTypeInst.push_back(elemTypeId); + finalizeTypeInstruction(); + + // Emit decorations + const ParmVarDecl *nodeDecl = npaType->getNodeDecl(); + if (hlsl::IsHLSLNodeOutputType(nodeDecl->getType())) { + StringRef name = nodeDecl->getName(); + unsigned index = 0; + if (auto nodeID = nodeDecl->getAttr()) { + name = nodeID->getName(); + index = nodeID->getArrayIndex(); + } + + auto *str = new (context) SpirvConstantString(name); + uint32_t nodeName = getOrCreateConstantString(str); + emitDecoration(id, spv::Decoration::PayloadNodeNameAMDX, {nodeName}, + llvm::None, true); + if (index) { + uint32_t baseIndex = getOrCreateConstantInt( + llvm::APInt(32, index), context.getUIntType(32), false); + emitDecoration(id, spv::Decoration::PayloadNodeBaseIndexAMDX, + {baseIndex}, llvm::None, true); + } + } + + uint32_t maxRecords; + if (const auto *attr = nodeDecl->getAttr()) { + maxRecords = getOrCreateConstantInt(llvm::APInt(32, attr->getMaxCount()), + context.getUIntType(32), false); + } else { + maxRecords = getOrCreateConstantInt(llvm::APInt(32, 1), + context.getUIntType(32), false); + } + emitDecoration(id, spv::Decoration::NodeMaxPayloadsAMDX, {maxRecords}, + llvm::None, true); + + if (const auto *attr = nodeDecl->getAttr()) { + const DeclContext *dc = nodeDecl->getParentFunctionOrMethod(); + if (const auto *funDecl = dyn_cast_or_null(dc)) { + IdentifierInfo *ii = attr->getName(); + bool alreadyExists = false; + for (auto *paramDecl : funDecl->params()) { + if (paramDecl->getIdentifier() == ii) { + assert(paramDecl != nodeDecl); + auto otherType = context.getNodeDeclPayloadType(paramDecl); + const uint32_t otherId = + getResultIdForType(otherType, &alreadyExists); + assert(alreadyExists && "forward references not allowed in " + "MaxRecordsSharedWith attribute"); + emitDecoration(id, spv::Decoration::NodeSharesPayloadLimitsWithAMDX, + {otherId}, llvm::None, true); + break; + } + } + assert(alreadyExists && + "invalid reference in MaxRecordsSharedWith attribute"); + } + } + if (const auto *attr = nodeDecl->getAttr()) { + emitDecoration(id, spv::Decoration::PayloadNodeSparseArrayAMDX, {}, + llvm::None); + } + if (const auto *attr = nodeDecl->getAttr()) { + emitDecoration(id, spv::Decoration::PayloadNodeSparseArrayAMDX, {}, + llvm::None); + } + if (const auto *attr = nodeDecl->getAttr()) { + uint32_t arraySize = getOrCreateConstantInt( + llvm::APInt(32, attr->getCount()), context.getUIntType(32), false); + emitDecoration(id, 
spv::Decoration::PayloadNodeArraySizeAMDX, {arraySize}, + llvm::None, true); + } + } // Structure types else if (const auto *structType = dyn_cast(type)) { std::vector> @@ -2545,6 +2727,15 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) { } } + if (const auto recordDecl = dyn_cast_or_null( + context.getStructDeclForSpirvType(structType))) { + auto index = context.getDispatchGridIndex(recordDecl); + if (index.hasValue()) { + emitDecoration(id, spv::Decoration::PayloadDispatchIndirectAMDX, {}, + index); + } + } + // Emit OpMemberName for the struct members. for (size_t i = 0; i < fieldsToGenerate.size(); ++i) emitNameForType(fieldsToGenerate[i].get().name, id, i); @@ -2607,6 +2798,13 @@ uint32_t EmitTypeHandler::emitType(const SpirvType *type) { else if (interfaceType == StructInterfaceType::UniformBuffer) emitDecoration(id, spv::Decoration::Block, {}); + // Emit NodeTrackRWInputSharing decoration if attribute is present. + const auto *structDecl = dyn_cast_or_null( + context.getStructDeclForSpirvType(structType)); + if (structDecl && structDecl->hasAttr()) { + emitDecoration(id, spv::Decoration::TrackFinishWritingAMDX, {}); + } + initTypeInstruction(spv::Op::OpTypeStruct); curTypeInst.push_back(id); for (auto fieldTypeId : fieldTypeIds) @@ -2749,14 +2947,17 @@ void EmitTypeHandler::emitLiteral(const SpirvConstant *literal, void EmitTypeHandler::emitDecoration(uint32_t typeResultId, spv::Decoration decoration, llvm::ArrayRef decorationParams, - llvm::Optional memberIndex) { - + llvm::Optional memberIndex, + bool usesIdParams) { spv::Op op = memberIndex.hasValue() ? spv::Op::OpMemberDecorate : spv::Op::OpDecorate; if (decoration == spv::Decoration::UserTypeGOOGLE) { op = memberIndex.hasValue() ? spv::Op::OpMemberDecorateString : spv::Op::OpDecorateString; } + if (usesIdParams) { + op = spv::Op::OpDecorateId; + } assert(curDecorationInst.empty()); curDecorationInst.push_back(static_cast(op)); diff --git a/tools/clang/lib/SPIRV/EmitVisitor.h b/tools/clang/lib/SPIRV/EmitVisitor.h index 1f9b0939e6..fb4b22e52b 100644 --- a/tools/clang/lib/SPIRV/EmitVisitor.h +++ b/tools/clang/lib/SPIRV/EmitVisitor.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SPIRV_EMITVISITOR_H #define LLVM_CLANG_SPIRV_EMITVISITOR_H @@ -86,7 +83,8 @@ class EmitTypeHandler { // parameters. 
void emitDecoration(uint32_t typeResultId, spv::Decoration, llvm::ArrayRef decorationParams, - llvm::Optional memberIndex = llvm::None); + llvm::Optional memberIndex = llvm::None, + bool usesIdParams = false); uint32_t getOrCreateConstant(SpirvConstant *); @@ -113,6 +111,7 @@ class EmitTypeHandler { uint32_t getOrCreateConstantNull(SpirvConstantNull *); uint32_t getOrCreateUndef(SpirvUndef *); uint32_t getOrCreateConstantBool(SpirvConstantBoolean *); + uint32_t getOrCreateConstantString(SpirvConstantString *); template void emitLiteral(const SpirvConstant *, vecType &outInst); template @@ -176,6 +175,7 @@ class EmitTypeHandler { emittedConstantInts; llvm::DenseMap, uint32_t> emittedConstantFloats; + llvm::DenseMap emittedConstantStrings; llvm::SmallVector emittedConstantComposites; llvm::SmallVector emittedConstantNulls; llvm::SmallVector emittedUndef; @@ -233,7 +233,7 @@ class EmitVisitor : public Visitor { bool visit(SpirvEmitVertex *) override; bool visit(SpirvEndPrimitive *) override; bool visit(SpirvEntryPoint *) override; - bool visit(SpirvExecutionMode *) override; + bool visit(SpirvExecutionModeBase *) override; bool visit(SpirvString *) override; bool visit(SpirvSource *) override; bool visit(SpirvModuleProcessed *) override; @@ -251,6 +251,11 @@ class EmitVisitor : public Visitor { bool visit(SpirvAccessChain *) override; bool visit(SpirvAtomic *) override; bool visit(SpirvBarrier *) override; + bool visit(SpirvIsNodePayloadValid *inst) override; + bool visit(SpirvNodePayloadArrayLength *inst) override; + bool visit(SpirvAllocateNodePayloads *inst) override; + bool visit(SpirvEnqueueNodePayloads *inst) override; + bool visit(SpirvFinishWritingNodePayload *inst) override; bool visit(SpirvBinaryOp *) override; bool visit(SpirvBitFieldExtract *) override; bool visit(SpirvBitFieldInsert *) override; @@ -258,6 +263,7 @@ class EmitVisitor : public Visitor { bool visit(SpirvConstantInteger *) override; bool visit(SpirvConstantFloat *) override; bool visit(SpirvConstantComposite *) override; + bool visit(SpirvConstantString *) override; bool visit(SpirvConstantNull *) override; bool visit(SpirvConvertPtrToU *) override; bool visit(SpirvConvertUToPtr *) override; @@ -458,6 +464,10 @@ class EmitVisitor : public Visitor { std::vector mainBinary; // String literals to SpirvString objects llvm::StringMap stringIdMap; + // String literals to SpirvConstantString objects + llvm::StringMap stringConstantIdMap; + // String spec constants + llvm::DenseSet stringSpecConstantInstructions; // Main file information for debugging that will be used by OpLine. uint32_t debugMainFileId; // Id for Vulkan DebugInfo extended instruction set. 
Used when generating diff --git a/tools/clang/lib/SPIRV/FeatureManager.cpp b/tools/clang/lib/SPIRV/FeatureManager.cpp index 7fb449fee9..b6aed4d8b6 100644 --- a/tools/clang/lib/SPIRV/FeatureManager.cpp +++ b/tools/clang/lib/SPIRV/FeatureManager.cpp @@ -214,6 +214,7 @@ Extension FeatureManager::getExtensionSymbol(llvm::StringRef name) { .Case("SPV_EXT_shader_image_int64", Extension::EXT_shader_image_int64) .Case("SPV_KHR_physical_storage_buffer", Extension::KHR_physical_storage_buffer) + .Case("SPV_AMDX_shader_enqueue", Extension::AMD_shader_enqueue) .Case("SPV_KHR_vulkan_memory_model", Extension::KHR_vulkan_memory_model) .Case("SPV_KHR_compute_shader_derivatives", Extension::KHR_compute_shader_derivatives) @@ -284,6 +285,8 @@ const char *FeatureManager::getExtensionName(Extension symbol) { return "SPV_EXT_shader_image_int64"; case Extension::KHR_physical_storage_buffer: return "SPV_KHR_physical_storage_buffer"; + case Extension::AMD_shader_enqueue: + return "SPV_AMDX_shader_enqueue"; case Extension::KHR_vulkan_memory_model: return "SPV_KHR_vulkan_memory_model"; case Extension::KHR_compute_shader_derivatives: diff --git a/tools/clang/lib/SPIRV/GlPerVertex.cpp b/tools/clang/lib/SPIRV/GlPerVertex.cpp index 09b09236b4..aa5a40d008 100644 --- a/tools/clang/lib/SPIRV/GlPerVertex.cpp +++ b/tools/clang/lib/SPIRV/GlPerVertex.cpp @@ -324,6 +324,9 @@ bool GlPerVertex::setClipCullDistanceType(SemanticIndexToTypeMap *typeMap, bool GlPerVertex::doGlPerVertexFacts(const NamedDecl *decl, QualType baseType, bool asInput) { + if (hlsl::IsHLSLNodeType(baseType)) { + return true; + } llvm::StringRef semanticStr; const hlsl::Semantic *semantic = {}; diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp index b31d19b5d8..45d04e8160 100644 --- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp +++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #include "LowerTypeVisitor.h" @@ -40,33 +37,6 @@ inline uint32_t roundToPow2(uint32_t val, uint32_t pow2) { } // end anonymous namespace -// This method sorts a field list in the following order: -// - fields with register annotation first, sorted by register index. -// - then fields without annotation, in order of declaration. 
-static std::vector -sortFields(llvm::ArrayRef fields) { - std::vector output; - output.resize(fields.size()); - - auto back_inserter = output.rbegin(); - std::map fixed_fields; - for (auto it = fields.rbegin(); it < fields.rend(); it++) { - if (it->registerC) { - fixed_fields.insert({it->registerC->RegisterNumber, &*it}); - } else { - *back_inserter = &*it; - back_inserter++; - } - } - - auto front_inserter = output.begin(); - for (const auto &item : fixed_fields) { - *front_inserter = item.second; - front_inserter++; - } - return output; -} - static void setDefaultFieldSize(const AlignmentSizeCalculator &alignmentCalc, const SpirvLayoutRule rule, const HybridStructType::FieldInfo *currentField, @@ -295,6 +265,37 @@ bool LowerTypeVisitor::visitInstruction(SpirvInstruction *instr) { return true; } +std::vector LowerTypeVisitor::sortFields( + llvm::ArrayRef fields) { + std::vector output; + output.resize(fields.size()); + + auto back_inserter = output.rbegin(); + std::map fixed_fields; + for (auto it = fields.rbegin(); it < fields.rend(); it++) { + if (it->registerC) { + auto insertionResult = + fixed_fields.insert({it->registerC->RegisterNumber, &*it}); + if (!insertionResult.second) { + emitError( + "field \"%0\" at register(c%1) overlaps with previous members", + it->registerC->Loc) + << it->name << it->registerC->RegisterNumber; + } + } else { + *back_inserter = &*it; + back_inserter++; + } + } + + auto front_inserter = output.begin(); + for (const auto &item : fixed_fields) { + *front_inserter = item.second; + front_inserter++; + } + return output; +} + const SpirvType *LowerTypeVisitor::lowerType(const SpirvType *type, SpirvLayoutRule rule, SourceLocation loc) { @@ -365,6 +366,16 @@ const SpirvType *LowerTypeVisitor::lowerType(const SpirvType *type, return raType; return spvContext.getRuntimeArrayType(loweredElemType, raType->getStride()); } + // Node payload arrays could contain a hybrid type + else if (const auto *npaType = dyn_cast(type)) { + const auto *loweredElemType = + lowerType(npaType->getElementType(), rule, loc); + // If runtime array didn't contain any hybrid types, return itself. + if (npaType->getElementType() == loweredElemType) + return npaType; + return spvContext.getNodePayloadArrayType(loweredElemType, + npaType->getNodeDecl()); + } // Pointer types could point to a hybrid type. else if (const auto *ptrType = dyn_cast(type)) { const auto *loweredPointee = @@ -1149,6 +1160,10 @@ LowerTypeVisitor::lowerStructFields(const RecordDecl *decl, spv::ImageFormat LowerTypeVisitor::translateSampledTypeToImageFormat(QualType sampledType, SourceLocation srcLoc) { + + if (spvOptions.useUnknownImageFormat) + return spv::ImageFormat::Unknown; + uint32_t elemCount = 1; QualType ty = {}; if (!isScalarType(sampledType, &ty) && @@ -1367,12 +1382,19 @@ LowerTypeVisitor::populateLayoutInformation( llvm::SmallVector loweredFields; llvm::DenseMap fieldToIndexMap; + llvm::SmallVector result; + // This stores the index of the field in the actual SPIR-V construct. // When bitfields are merged, this index will be the same for merged fields. uint32_t fieldIndexInConstruct = 0; for (size_t i = 0, iPrevious = -1; i < sortedFields.size(); iPrevious = i++) { const size_t fieldIndexForMap = loweredFields.size(); + // Can happen if sortFields runs over fields with the same register(c#) + if (!sortedFields[i]) { + return result; + } + loweredFields.emplace_back(fieldVisitor( (iPrevious < loweredFields.size() ? 
&loweredFields[iPrevious] : nullptr), @@ -1386,7 +1408,6 @@ LowerTypeVisitor::populateLayoutInformation( } // Re-order the sorted fields back to their original order. - llvm::SmallVector result; for (const auto &field : fields) result.push_back(loweredFields[fieldToIndexMap[&field]]); return result; diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.h b/tools/clang/lib/SPIRV/LowerTypeVisitor.h index 5b26b67e3a..276e6c9232 100644 --- a/tools/clang/lib/SPIRV/LowerTypeVisitor.h +++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_LIB_SPIRV_LOWERTYPEVISITOR_H @@ -65,6 +62,12 @@ class LowerTypeVisitor : public Visitor { return astContext.getDiagnostics().Report(srcLoc, diagId); } + // This method sorts a field list in the following order: + // - fields with register annotation first, sorted by register index. + // - then fields without annotation, in order of declaration. + std::vector + sortFields(llvm::ArrayRef fields); + /// Lowers the given Hybrid type into a SPIR-V type. /// /// Uses the above lowerType method to lower the QualType components of hybrid diff --git a/tools/clang/lib/SPIRV/PreciseVisitor.cpp b/tools/clang/lib/SPIRV/PreciseVisitor.cpp index 34e6087990..f1869318a4 100644 --- a/tools/clang/lib/SPIRV/PreciseVisitor.cpp +++ b/tools/clang/lib/SPIRV/PreciseVisitor.cpp @@ -60,6 +60,9 @@ bool isAccessingPrecise(clang::spirv::SpirvAccessChain *inst) { } else if (auto *raType = llvm::dyn_cast(baseType)) { indexes.pop(); baseType = raType->getElementType(); + } else if (auto *npaType = llvm::dyn_cast(baseType)) { + indexes.pop(); + baseType = npaType->getElementType(); } else if (auto *structType = llvm::dyn_cast(baseType)) { SpirvInstruction *index = indexes.top(); if (auto *constInt = llvm::dyn_cast(index)) { diff --git a/tools/clang/lib/SPIRV/SpirvBuilder.cpp b/tools/clang/lib/SPIRV/SpirvBuilder.cpp index 689fc0715f..22523eed0e 100644 --- a/tools/clang/lib/SPIRV/SpirvBuilder.cpp +++ b/tools/clang/lib/SPIRV/SpirvBuilder.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #include "clang/SPIRV/SpirvBuilder.h" @@ -84,7 +81,9 @@ SpirvBuilder::addFnParam(QualType ptrType, bool isPrecise, bool isNointerp, param = new (context) SpirvFunctionParameter(ptrType, isPrecise, isNointerp, loc); } - param->setStorageClass(spv::StorageClass::Function); + param->setStorageClass(hlsl::IsHLSLNodeInputType(ptrType) + ? 
spv::StorageClass::NodePayloadAMDX + : spv::StorageClass::Function); param->setDebugName(name); function->addParameter(param); return param; @@ -206,10 +205,17 @@ SpirvInstruction *SpirvBuilder::createLoad(QualType resultType, instruction->setRValue(true); if (pointer->getStorageClass() == spv::StorageClass::PhysicalStorageBuffer) { - AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions); - uint32_t align, size, stride; - std::tie(align, size) = alignmentCalc.getAlignmentAndSize( - resultType, pointer->getLayoutRule(), llvm::None, &stride); + QualType pointerType = pointer->getAstResultType(); + uint32_t align = 0; + if (!pointerType.isNull() && hlsl::IsVKBufferPointerType(pointerType)) { + align = hlsl::GetVKBufferPointerAlignment(pointerType); + } + if (!align) { + AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions); + uint32_t stride; + std::tie(align, std::ignore) = alignmentCalc.getAlignmentAndSize( + resultType, pointer->getLayoutRule(), llvm::None, &stride); + } instruction->setAlignment(align); } @@ -233,6 +239,13 @@ SpirvInstruction *SpirvBuilder::createLoad(QualType resultType, createEndInvocationInterlockEXT(loc, range); } + if (context.hasLoweredType(pointer)) { + // preserve distinct node payload array types + auto *ptrType = dyn_cast(pointer->getResultType()); + instruction->setResultType(ptrType->getPointeeType()); + context.addToInstructionsWithLoweredType(instruction); + } + const auto &bitfieldInfo = pointer->getBitfieldInfo(); if (!bitfieldInfo.hasValue()) return instruction; @@ -309,6 +322,12 @@ SpirvStore *SpirvBuilder::createStore(SpirvInstruction *address, auto *instruction = new (context) SpirvStore(loc, address, source, llvm::None, range); + if (context.hasLoweredType(source)) { + // preserve distinct node payload array types + address->setResultType(context.getPointerType(source->getResultType(), + address->getStorageClass())); + context.addToInstructionsWithLoweredType(address); + } insertPoint->addInstruction(instruction); if (address->getStorageClass() == spv::StorageClass::PhysicalStorageBuffer && @@ -316,7 +335,7 @@ SpirvStore *SpirvBuilder::createStore(SpirvInstruction *address, AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions); uint32_t align, size, stride; std::tie(align, size) = alignmentCalc.getAlignmentAndSize( - address->getAstResultType(), address->getLayoutRule(), llvm::None, + source->getAstResultType(), address->getLayoutRule(), llvm::None, &stride); instruction->setAlignment(align); } @@ -875,6 +894,53 @@ SpirvInstruction *SpirvBuilder::createNonSemanticDebugPrintfExtInst( return extInst; } +SpirvInstruction * +SpirvBuilder::createIsNodePayloadValid(SpirvInstruction *payloadArray, + SpirvInstruction *nodeIndex, + SourceLocation loc) { + auto *inst = new (context) + SpirvIsNodePayloadValid(astContext.BoolTy, loc, payloadArray, nodeIndex); + insertPoint->addInstruction(inst); + return inst; +} + +SpirvInstruction * +SpirvBuilder::createNodePayloadArrayLength(SpirvInstruction *payloadArray, + SourceLocation loc) { + auto *inst = new (context) + SpirvNodePayloadArrayLength(astContext.UnsignedIntTy, loc, payloadArray); + insertPoint->addInstruction(inst); + return inst; +} + +SpirvInstruction *SpirvBuilder::createAllocateNodePayloads( + QualType resultType, spv::Scope allocationScope, + SpirvInstruction *shaderIndex, SpirvInstruction *recordCount, + SourceLocation loc) { + assert(insertPoint && "null insert point"); + auto *inst = new (context) SpirvAllocateNodePayloads( + resultType, loc, allocationScope, 
shaderIndex, recordCount); + insertPoint->addInstruction(inst); + return inst; +} + +void SpirvBuilder::createEnqueueOutputNodePayloads(SpirvInstruction *payload, + SourceLocation loc) { + assert(insertPoint && "null insert point"); + auto *inst = new (context) SpirvEnqueueNodePayloads(loc, payload); + insertPoint->addInstruction(inst); +} + +SpirvInstruction * +SpirvBuilder::createFinishWritingNodePayload(SpirvInstruction *payload, + SourceLocation loc) { + assert(insertPoint && "null insert point"); + auto *inst = new (context) + SpirvFinishWritingNodePayload(astContext.BoolTy, loc, payload); + insertPoint->addInstruction(inst); + return inst; +} + void SpirvBuilder::createBarrier(spv::Scope memoryScope, spv::MemorySemanticsMask memorySemantics, llvm::Optional exec, @@ -1869,6 +1935,14 @@ SpirvConstant *SpirvBuilder::getConstantNull(QualType type) { return nullConst; } +SpirvConstant *SpirvBuilder::getConstantString(llvm::StringRef str, + bool specConst) { + // We do not care about making unique constants at this point. + auto *stringConst = new (context) SpirvConstantString(str, specConst); + mod->addConstant(stringConst); + return stringConst; +} + SpirvUndef *SpirvBuilder::getUndef(QualType type) { // We do not care about making unique constants at this point. auto *undef = new (context) SpirvUndef(type); diff --git a/tools/clang/lib/SPIRV/SpirvContext.cpp b/tools/clang/lib/SPIRV/SpirvContext.cpp index 47dfc67433..88716dddde 100644 --- a/tools/clang/lib/SPIRV/SpirvContext.cpp +++ b/tools/clang/lib/SPIRV/SpirvContext.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// #include @@ -65,6 +62,9 @@ SpirvContext::~SpirvContext() { for (auto *raType : runtimeArrayTypes) raType->~RuntimeArrayType(); + for (auto *npaType : nodePayloadArrayTypes) + npaType->~NodePayloadArrayType(); + for (auto *fnType : functionTypes) fnType->~FunctionType(); @@ -276,6 +276,19 @@ SpirvContext::getRuntimeArrayType(const SpirvType *elemType, return *(inserted.first); } +const NodePayloadArrayType * +SpirvContext::getNodePayloadArrayType(const SpirvType *elemType, + const ParmVarDecl *nodeDecl) { + NodePayloadArrayType type(elemType, nodeDecl); + auto found = nodePayloadArrayTypes.find(&type); + if (found != nodePayloadArrayTypes.end()) + return *found; + + auto inserted = nodePayloadArrayTypes.insert( + new (this) NodePayloadArrayType(elemType, nodeDecl)); + return *(inserted.first); +} + const StructType * SpirvContext::getStructType(llvm::ArrayRef fields, llvm::StringRef name, bool isReadOnly, diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp index cd5f860555..734340e9ae 100644 --- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp +++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. 
-// //===----------------------------------------------------------------------===// // // This file implements a SPIR-V emitter class that takes in HLSL AST and emits @@ -268,7 +265,8 @@ inline bool canActAsInParmVar(const ParmVarDecl *param) { return !param->hasAttr() && // GS output streams are marked as inout, but it should not be // used as in parameter. - !hlsl::IsHLSLStreamOutputType(param->getType()); + !hlsl::IsHLSLStreamOutputType(param->getType()) && + !hlsl::IsHLSLNodeOutputType(param->getType()); } /// Returns true if the given function parameter can act as shader stage @@ -604,8 +602,8 @@ SpirvEmitter::SpirvEmitter(CompilerInstance &ci) emitError("unknown shader module: %0", {}) << shaderModel->GetName(); if (spirvOptions.invertY && !shaderModel->IsVS() && !shaderModel->IsDS() && - !shaderModel->IsGS() && !shaderModel->IsMS()) - emitError("-fvk-invert-y can only be used in VS/DS/GS/MS", {}); + !shaderModel->IsGS() && !shaderModel->IsMS() && !shaderModel->IsLib()) + emitError("-fvk-invert-y can only be used in VS/DS/GS/MS/Lib", {}); if (spirvOptions.useGlLayout && spirvOptions.useDxLayout) emitError("cannot specify both -fvk-use-dx-layout and -fvk-use-gl-layout", @@ -1146,8 +1144,9 @@ void SpirvEmitter::doStmt(const Stmt *stmt, // All cases for expressions used as statements SpirvInstruction *result = doExpr(expr); - if (result && result->getKind() == SpirvInstruction::IK_ExecutionMode && - !attrs.empty()) { + if (result && !attrs.empty() && + (result->getKind() == SpirvInstruction::IK_ExecutionMode || + result->getKind() == SpirvInstruction::IK_ExecutionModeId)) { // Handle [[vk::ext_capability(..)]] and [[vk::ext_extension(..)]] // attributes for vk::ext_execution_mode[_id](..). createSpirvIntrInstExt( @@ -1262,6 +1261,15 @@ SpirvInstruction *SpirvEmitter::doExpr(const Expr *expr, return result; } +SpirvInstruction *SpirvEmitter::doExprEnsuringRValue(const Expr *E, + SourceLocation location, + SourceRange range) { + SpirvInstruction *I = doExpr(E); + if (I->isRValue()) + return I; + return spvBuilder.createLoad(E->getType(), I, location, range); +} + SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr, SourceRange rangeOverride) { // We are trying to load the value here, which is what an LValueToRValue @@ -1274,7 +1282,8 @@ SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr, } SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr, - SpirvInstruction *info) { + SpirvInstruction *info, + SourceRange rangeOverride) { const auto exprType = expr->getType(); // Do nothing if this is already rvalue @@ -1309,9 +1318,11 @@ SpirvInstruction *SpirvEmitter::loadIfGLValue(const Expr *expr, return info; } + SourceRange range = + (rangeOverride != SourceRange()) ? 
rangeOverride : expr->getSourceRange(); SpirvInstruction *loadedInstr = nullptr; - loadedInstr = spvBuilder.createLoad(exprType, info, expr->getExprLoc(), - expr->getSourceRange()); + loadedInstr = + spvBuilder.createLoad(exprType, info, expr->getExprLoc(), range); assert(loadedInstr); // Special-case: According to the SPIR-V Spec: There is no physical size or @@ -1414,6 +1425,83 @@ SpirvInstruction *SpirvEmitter::castToType(SpirvInstruction *value, return nullptr; } +static bool handleDispatchGrid(SpirvContext &spvContext, + const RecordDecl *recordDecl) { + unsigned index = 0; + for (auto fieldDecl : recordDecl->fields()) { + QualType fieldType = fieldDecl->getType(); + for (const hlsl::UnusualAnnotation *it : + fieldDecl->getUnusualAnnotations()) { + if (it->getKind() == hlsl::UnusualAnnotation::UA_SemanticDecl) { + const hlsl::SemanticDecl *sd = cast(it); + if (sd->SemanticName.equals("SV_DispatchGrid")) { + spvContext.registerDispatchGridIndex(recordDecl, index); + return true; + } + } + } + if (const auto *innerType = fieldType->getAs()) { + if (handleDispatchGrid(spvContext, innerType->getDecl())) + return true; + } + ++index; + } + return false; +} + +bool SpirvEmitter::handleNodePayloadArrayType(const ParmVarDecl *decl, + SpirvInstruction *instr) { + // Because SPIR-V node payload array types are node-specific, propagate + // lowered types + switch (instr->getKind()) { + case SpirvInstruction::Kind::IK_Load: { + SpirvInstruction *ptr = dyn_cast(instr)->getPointer(); + if (handleNodePayloadArrayType(decl, ptr)) { + const SpirvPointerType *ptrType = + dyn_cast(ptr->getResultType()); + instr->setResultType(ptrType->getPointeeType()); + spvContext.addToInstructionsWithLoweredType(instr); + return true; + } + return false; + } + case SpirvInstruction::Kind::IK_FunctionParameter: + case SpirvInstruction::Kind::IK_Variable: { + QualType varType = decl->getType(); + if (hlsl::IsHLSLNodeType(varType)) { + if (auto *type = spvContext.getNodeDeclPayloadType(decl)) { + instr->setResultType( + spvContext.getPointerType(type, instr->getStorageClass())); + } else { + LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions, + spvBuilder); + QualType resultType = + hlsl::GetHLSLNodeIOResultType(astContext, varType); + const auto *recordType = resultType->getAs(); + assert(recordType); + if (hlsl::IsHLSLDispatchNodeInputRecordType(varType)) { + handleDispatchGrid(spvContext, recordType->getDecl()); + } + const SpirvType *elemType = lowerTypeVisitor.lowerType( + resultType, clang::spirv::SpirvLayoutRule::Scalar, llvm::None, + decl->getLocation()); + const NodePayloadArrayType *arrType = + spvContext.getNodePayloadArrayType(elemType, decl); + const SpirvType *ptrType = + spvContext.getPointerType(arrType, instr->getStorageClass()); + instr->setResultType(ptrType); + spvContext.registerNodeDeclPayloadType(arrType, decl); + } + spvContext.addToInstructionsWithLoweredType(instr); + return true; + } + return false; + } + default: + return false; + } +} + void SpirvEmitter::doFunctionDecl(const FunctionDecl *decl) { // Forward declaration of a function inside another. 
if (!decl->isThisDeclarationADefinition()) { @@ -1555,6 +1643,9 @@ void SpirvEmitter::doFunctionDecl(const FunctionDecl *decl) { QualType paramType = paramDecl->getType(); auto *param = declIdMapper.createFnParam(paramDecl, i + 1 + isNonStaticMemberFn); + if (isEntry) { + handleNodePayloadArrayType(paramDecl, param); + } #ifdef ENABLE_SPIRV_CODEGEN if (hlsl::IsVKBufferPointerType(paramType)) { Optional isRowMajor = llvm::None; @@ -2020,6 +2111,10 @@ void SpirvEmitter::doVarDecl(const VarDecl *decl) { // variables) belongs to the Function storage class. if (isExternalVar(decl)) { var = declIdMapper.createExternVar(decl); + if (decl->hasInit()) { + emitWarning("Initializer of external global will be ignored", + decl->getLocation()); + } } else { // We already know the variable is not externally visible here. If it does // not have local storage, it should be file scope variable. @@ -4304,9 +4399,7 @@ SpirvEmitter::processTextureLevelOfDetail(const CXXMemberCallExpr *expr, spvBuilder.createImageQuery(spv::Op::OpImageQueryLod, queryResultType, expr->getExprLoc(), sampledImage, coordinate); - if (spvContext.isCS()) { - addDerivativeGroupExecutionMode(); - } + addDerivativeGroupExecutionMode(); // The first component of the float2 contains the mipmap array layer. // The second component of the float2 represents the unclamped lod. return spvBuilder.createCompositeExtract(astContext.FloatTy, query, @@ -5307,6 +5400,9 @@ SpirvEmitter::doCXXMemberCallExpr(const CXXMemberCallExpr *expr) { uint32_t opcode = static_cast(hlsl::IntrinsicOp::Num_Intrinsics); if (hlsl::GetIntrinsicOp(callee, opcode, group)) { + if (group == "subscript") { + return processIntrinsicExtractRecordStruct(expr); + } return processIntrinsicMemberCall(expr, static_cast(opcode)); } @@ -5503,6 +5599,28 @@ SpirvEmitter::processIntrinsicMemberCall(const CXXMemberCallExpr *expr, return processRayQueryIntrinsics(expr, opcode); case IntrinsicOp::MOP_GetBufferContents: return processIntrinsicGetBufferContents(expr); + case hlsl::IntrinsicOp::MOP_GetThreadNodeOutputRecords: + return processIntrinsicGetNodeOutputRecords(expr, false); + case hlsl::IntrinsicOp::MOP_GetGroupNodeOutputRecords: + return processIntrinsicGetNodeOutputRecords(expr, true); + case hlsl::IntrinsicOp::MOP_ThreadIncrementOutputCount: + retVal = processIntrinsicIncrementOutputCount(expr, false); + break; + case hlsl::IntrinsicOp::MOP_GroupIncrementOutputCount: + retVal = processIntrinsicIncrementOutputCount(expr, true); + break; + case hlsl::IntrinsicOp::MOP_IsValid: + retVal = processIntrinsicIsValid(expr); + break; + case hlsl::IntrinsicOp::MOP_Count: + retVal = processIntrinsicGetRecordCount(expr); + break; + case hlsl::IntrinsicOp::MOP_OutputComplete: + processIntrinsicOutputComplete(expr); + break; + case hlsl::IntrinsicOp::MOP_FinishedCrossGroupSharing: + retVal = processIntrinsicFinishedCrossGroupSharing(expr); + break; default: emitError("intrinsic '%0' method unimplemented", expr->getCallee()->getExprLoc()) @@ -5554,7 +5672,8 @@ SpirvInstruction *SpirvEmitter::createImageSample( const bool isExplicit = lod || (grad.first && grad.second); // Implicit-lod instructions are only allowed in pixel and compute shaders. 
-  if (!spvContext.isPS() && !spvContext.isCS() && !isExplicit)
+  if (!spvContext.isPS() && !spvContext.isCS() && !spvContext.isNode() &&
+      !isExplicit)
     emitError("sampling with implicit lod is only allowed in fragment and "
               "compute shaders",
               loc);
@@ -5659,9 +5778,7 @@ SpirvEmitter::processTextureSampleGather(const CXXMemberCallExpr *expr,
   const auto retType = expr->getDirectCallee()->getReturnType();
 
   if (isSample) {
-    if (spvContext.isCS()) {
-      addDerivativeGroupExecutionMode();
-    }
+    addDerivativeGroupExecutionMode();
     return createImageSample(retType, imageType, image, sampler, coordinate,
                              /*compareVal*/ nullptr, /*bias*/ nullptr,
                              /*lod*/ nullptr, std::make_pair(nullptr, nullptr),
@@ -5749,9 +5866,9 @@ SpirvEmitter::processTextureSampleBiasLevel(const CXXMemberCallExpr *expr,
 
   const auto retType = expr->getDirectCallee()->getReturnType();
 
-  if (!lod && spvContext.isCS()) {
+  if (!lod)
     addDerivativeGroupExecutionMode();
-  }
+
   return createImageSample(
       retType, imageType, image, sampler, coordinate, /*compareVal*/ nullptr,
       bias, lod, std::make_pair(nullptr, nullptr),
@@ -5871,9 +5988,7 @@ SpirvEmitter::processTextureSampleCmp(const CXXMemberCallExpr *expr) {
   const auto retType = expr->getDirectCallee()->getReturnType();
   const auto imageType = imageExpr->getType();
 
-  if (spvContext.isCS()) {
-    addDerivativeGroupExecutionMode();
-  }
+  addDerivativeGroupExecutionMode();
 
   return createImageSample(
       retType, imageType, image, sampler, coordinate, compareVal,
@@ -5926,9 +6041,7 @@ SpirvEmitter::processTextureSampleCmpBias(const CXXMemberCallExpr *expr) {
   const auto retType = expr->getDirectCallee()->getReturnType();
   const auto imageType = imageExpr->getType();
 
-  if (spvContext.isCS()) {
-    addDerivativeGroupExecutionMode();
-  }
+  addDerivativeGroupExecutionMode();
 
   return createImageSample(
       retType, imageType, image, sampler, coordinate, compareVal, bias,
@@ -6987,6 +7100,38 @@ void SpirvEmitter::storeValue(SpirvInstruction *lhsPtr,
   }
 }
 
+bool SpirvEmitter::canUseOpCopyLogical(QualType type) const {
+  if (featureManager.getSpirvVersion(featureManager.getTargetEnv()) <
+      VersionTuple(1, 4)) {
+    return false;
+  }
+
+  if (!type->isArrayType() && !type->isRecordType()) {
+    return false;
+  }
+
+  if (const auto *recordType = type->getAs<RecordType>()) {
+    if (isTypeInVkNamespace(recordType) &&
+        (recordType->getDecl()->getName().equals("BufferPointer") ||
+         recordType->getDecl()->getName().equals("SpirvType") ||
+         recordType->getDecl()->getName().equals("SpirvOpaqueType"))) {
+      // vk::BufferPointer lowers to a pointer type. No need to reconstruct
+      // the value. The vk::Spirv*Type should be treated as an opaque type.
+      // All we can do is leave it the same.
+      return false;
+    }
+  }
+
+  if (hlsl::IsHLSLVecMatType(type) || hlsl::IsHLSLResourceType(type)) {
+    return false;
+  }
+
+  // If the type contains a bool it is possible that one type represents it
+  // with a bool and the other with an int. If that happens, OpCopyLogical is
+  // not valid.
+  return !isOrContainsBoolType(type);
+}
+
 SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
                                                  const QualType valType,
                                                  SpirvLayoutRule dstLR,
@@ -7050,6 +7195,13 @@ SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
     return result;
   };
 
+  if (canUseOpCopyLogical(valType)) {
+    SpirvInstruction *copy = spvBuilder.createUnaryOp(
+        spv::Op::OpCopyLogical, valType, srcVal, srcVal->getSourceLocation());
+    copy->setLayoutRule(dstLR);
+    return copy;
+  }
+
   // Constant arrays
   if (const auto *arrayType = astContext.getAsConstantArrayType(valType)) {
     const auto elemType = arrayType->getElementType();
@@ -7080,14 +7232,17 @@ SpirvInstruction *SpirvEmitter::reconstructValue(SpirvInstruction *srcVal,
 
   // Structs
   if (const auto *recordType = valType->getAs<RecordType>()) {
-    assert(recordType->isStructureType());
-
     if (isTypeInVkNamespace(recordType) &&
-        recordType->getDecl()->getName().equals("BufferPointer")) {
-      // Uniquely among structs, vk::BufferPointer lowers to a pointer type.
+        (recordType->getDecl()->getName().equals("BufferPointer") ||
+         recordType->getDecl()->getName().equals("SpirvType") ||
+         recordType->getDecl()->getName().equals("SpirvOpaqueType"))) {
+      // vk::BufferPointer lowers to a pointer type. No need to reconstruct
+      // the value. The vk::Spirv*Type should be treated as an opaque type.
+      // All we can do is leave it the same.
       return srcVal;
     }
+    assert(recordType->isStructureType());
     LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions,
                                       spvBuilder);
     const StructType *spirvStructType =
@@ -7955,15 +8110,12 @@ SpirvInstruction *SpirvEmitter::tryToAssignToVectorElements(
   }
 
   auto *vec1 = doExpr(base, range);
-  auto *vec1Val =
-      vec1->isRValue()
-          ? vec1
-          : spvBuilder.createLoad(baseType, vec1, base->getLocStart(), range);
+  auto *vec1Val = vec1->isRValue() ? vec1 : loadIfGLValue(base, vec1, range);
   auto *shuffle = spvBuilder.createVectorShuffle(
       baseType, vec1Val, rhs, selectors, lhs->getLocStart(), range);
 
   if (!tryToAssignToRWBufferRWTexture(base, shuffle))
-    spvBuilder.createStore(vec1, shuffle, lhs->getLocStart(), range);
+    storeValue(vec1, shuffle, base->getType(), lhs->getLocStart(), range);
 
   // TODO: OK, this return value is incorrect for compound assignments, for
   // which cases we should return lvalues. Should at least emit errors if
@@ -8633,9 +8785,10 @@ const Expr *SpirvEmitter::collectArrayStructIndices(
   }
 
   {
-    // Indexing into ConstantBuffers and TextureBuffers involves an additional
-    // FlatConversion node which casts the handle to the underlying structure
-    // type. We can look past the FlatConversion to continue to collect indices.
+    // Indexing into ConstantBuffers, TextureBuffers, and node input/output
+    // types involves an additional FlatConversion node which casts the handle
+    // to the underlying structure type. We can look past the FlatConversion to
+    // continue to collect indices.
// For example: MyConstantBufferArray[0].structMember1 // `-MemberExpr .structMember1 // `-ImplicitCastExpr 'const T' lvalue @@ -8644,7 +8797,8 @@ const Expr *SpirvEmitter::collectArrayStructIndices( if (castExpr->getCastKind() == CK_FlatConversion) { const auto *subExpr = castExpr->getSubExpr(); const QualType subExprType = subExpr->getType(); - if (isConstantTextureBuffer(subExprType)) { + if (isConstantTextureBuffer(subExprType) || + hlsl::IsHLSLNodeType(subExprType)) { return collectArrayStructIndices(subExpr, rawIndex, rawIndices, indices, isMSOutAttribute); } @@ -9046,6 +9200,9 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) { case hlsl::IntrinsicOp::IOP_udot: retVal = processIntrinsicDot(callExpr); break; + case hlsl::IntrinsicOp::IOP_Barrier: + retVal = processIntrinsicBarrier(callExpr); + break; case hlsl::IntrinsicOp::IOP_GroupMemoryBarrier: retVal = processIntrinsicMemoryBarrier(callExpr, /*isDevice*/ false, @@ -9078,6 +9235,9 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) { /*groupSync*/ true, /*isAllBarrier*/ true); break; + case hlsl::IntrinsicOp::IOP_GetRemainingRecursionLevels: + retVal = processIntrinsicGetRemainingRecursionLevels(callExpr); + break; case hlsl::IntrinsicOp::IOP_CheckAccessFullyMapped: retVal = spvBuilder.createImageSparseTexelsResident( doExpr(callExpr->getArg(0)), srcLoc, srcRange); @@ -9161,10 +9321,10 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) { retVal = processRawBufferStore(callExpr); break; case hlsl::IntrinsicOp::IOP_Vkext_execution_mode: - retVal = processIntrinsicExecutionMode(callExpr, false); + retVal = processIntrinsicExecutionMode(callExpr); break; case hlsl::IntrinsicOp::IOP_Vkext_execution_mode_id: - retVal = processIntrinsicExecutionMode(callExpr, true); + retVal = processIntrinsicExecutionModeId(callExpr); break; case hlsl::IntrinsicOp::IOP_saturate: retVal = processIntrinsicSaturate(callExpr); @@ -9483,12 +9643,17 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) { retVal = processIntrinsicPointerCast(callExpr, true); break; } - INTRINSIC_SPIRV_OP_CASE(ddx, DPdx, true); - INTRINSIC_SPIRV_OP_CASE(ddx_coarse, DPdxCoarse, false); - INTRINSIC_SPIRV_OP_CASE(ddx_fine, DPdxFine, false); - INTRINSIC_SPIRV_OP_CASE(ddy, DPdy, true); - INTRINSIC_SPIRV_OP_CASE(ddy_coarse, DPdyCoarse, false); - INTRINSIC_SPIRV_OP_CASE(ddy_fine, DPdyFine, false); + case hlsl::IntrinsicOp::IOP_ddx: + case hlsl::IntrinsicOp::IOP_ddx_coarse: + case hlsl::IntrinsicOp::IOP_ddx_fine: + case hlsl::IntrinsicOp::IOP_ddy: + case hlsl::IntrinsicOp::IOP_ddy_coarse: + case hlsl::IntrinsicOp::IOP_ddy_fine: { + retVal = processDerivativeIntrinsic(hlslOpcode, callExpr->getArg(0), + callExpr->getExprLoc(), + callExpr->getSourceRange()); + break; + } INTRINSIC_SPIRV_OP_CASE(countbits, BitCount, false); INTRINSIC_SPIRV_OP_CASE(fmod, FRem, true); INTRINSIC_SPIRV_OP_CASE(fwidth, Fwidth, true); @@ -9549,6 +9714,15 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) { return retVal; } +SpirvInstruction *SpirvEmitter::processIntrinsicGetRecordCount( + const CXXMemberCallExpr *callExpr) { + assert(callExpr->getNumArgs() == 0); + const auto obj = callExpr->getImplicitObjectArgument(); + const auto loc = callExpr->getExprLoc(); + SpirvInstruction *payload = doExpr(obj); + return spvBuilder.createNodePayloadArrayLength(payload, loc); +} + SpirvInstruction * SpirvEmitter::processIntrinsicFirstbit(const CallExpr *callExpr, GLSLstd450 glslOpcode) { @@ -9571,6 +9745,76 @@ 
SpirvEmitter::processIntrinsicFirstbit(const CallExpr *callExpr, srcRange); } +SpirvInstruction *SpirvEmitter::processMatrixDerivativeIntrinsic( + hlsl::IntrinsicOp hlslOpcode, const Expr *arg, SourceLocation loc, + SourceRange range) { + const auto actOnEachVec = [this, hlslOpcode, loc, range]( + uint32_t /*index*/, QualType inType, + QualType outType, SpirvInstruction *curRow) { + return processDerivativeIntrinsic(hlslOpcode, curRow, loc, range); + }; + + return processEachVectorInMatrix(arg, arg->getType(), doExpr(arg), + actOnEachVec, loc, range); +} + +SpirvInstruction * +SpirvEmitter::processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode, + const Expr *arg, SourceLocation loc, + SourceRange range) { + if (isMxNMatrix(arg->getType())) { + return processMatrixDerivativeIntrinsic(hlslOpcode, arg, loc, range); + } + return processDerivativeIntrinsic(hlslOpcode, doExpr(arg), loc, range); +} + +SpirvInstruction *SpirvEmitter::processDerivativeIntrinsic( + hlsl::IntrinsicOp hlslOpcode, SpirvInstruction *arg, SourceLocation loc, + SourceRange range) { + QualType returnType = arg->getAstResultType(); + assert(isFloatOrVecOfFloatType(returnType)); + + addDerivativeGroupExecutionMode(); + needsLegalization = true; + + QualType B32Type = astContext.FloatTy; + uint32_t vectorSize = 0; + QualType elementType = returnType; + if (isVectorType(returnType, &elementType, &vectorSize)) { + B32Type = astContext.getExtVectorType(B32Type, vectorSize); + } + + // Derivative operations work on 32-bit floats only. Cast to 32-bit if needed. + SpirvInstruction *operand = castToType(arg, returnType, B32Type, loc, range); + + spv::Op opcode = spv::Op::OpNop; + switch (hlslOpcode) { + case hlsl::IntrinsicOp::IOP_ddx: + opcode = spv::Op::OpDPdx; + break; + case hlsl::IntrinsicOp::IOP_ddx_coarse: + opcode = spv::Op::OpDPdxCoarse; + break; + case hlsl::IntrinsicOp::IOP_ddx_fine: + opcode = spv::Op::OpDPdxFine; + break; + case hlsl::IntrinsicOp::IOP_ddy: + opcode = spv::Op::OpDPdy; + break; + case hlsl::IntrinsicOp::IOP_ddy_coarse: + opcode = spv::Op::OpDPdyCoarse; + break; + case hlsl::IntrinsicOp::IOP_ddy_fine: + opcode = spv::Op::OpDPdyFine; + break; + }; + + SpirvInstruction *result = + spvBuilder.createUnaryOp(opcode, B32Type, operand, loc, range); + result = castToType(result, B32Type, returnType, loc, range); + return result; +} + // Returns true is the given expression can be used as an output parameter. // // Warning: this function could return false negatives. 
@@ -10926,38 +11170,202 @@ SpirvEmitter::processIntrinsicPointerCast(const CallExpr *callExpr, SpirvInstruction *SpirvEmitter::processIntrinsicGetBufferContents( const CXXMemberCallExpr *callExpr) { - LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions, - spvBuilder); - Expr *obj = callExpr->getImplicitObjectArgument(); - SpirvInstruction *bufferPointer = doExpr(obj); + SpirvInstruction *bufferPointer = + doExpr(callExpr->getImplicitObjectArgument()); if (!bufferPointer) return nullptr; - if (bufferPointer->isRValue()) { - bufferPointer->setRValue(false); - bufferPointer->setStorageClass(spv::StorageClass::PhysicalStorageBuffer); - return bufferPointer; - } - - unsigned align = hlsl::GetVKBufferPointerAlignment(obj->getType()); - lowerTypeVisitor.visitInstruction(bufferPointer); - - const SpirvPointerType *bufferPointerType = - dyn_cast(bufferPointer->getResultType()); - SpirvLoad *retVal = - spvBuilder.createLoad(bufferPointerType->getPointeeType(), bufferPointer, - callExpr->getLocStart()); - if (!align) { - QualType bufferType = hlsl::GetVKBufferPointerBufferType(obj->getType()); - AlignmentSizeCalculator alignmentCalc(astContext, spirvOptions); - uint32_t stride; - std::tie(align, std::ignore) = alignmentCalc.getAlignmentAndSize( - bufferType, retVal->getLayoutRule(), llvm::None, &stride); - } - retVal->setAlignment(align); + + SpirvInstruction *retVal = + bufferPointer->isRValue() + ? bufferPointer + : spvBuilder.createLoad(bufferPointer->getAstResultType(), + bufferPointer, callExpr->getLocStart()); retVal->setRValue(false); + retVal->setStorageClass(spv::StorageClass::PhysicalStorageBuffer); + retVal->setLayoutRule(spirvOptions.sBufferLayoutRule); return retVal; } +SpirvInstruction *SpirvEmitter::processIntrinsicExtractRecordStruct( + const CXXMemberCallExpr *callExpr) { + Expr *obj = callExpr->getImplicitObjectArgument(); + QualType objType = obj->getType(); + unsigned n = callExpr->getNumArgs(); + assert(hlsl::IsHLSLNodeType(objType)); + assert(n == 0 || n == 1 && hlsl::IsHLSLNodeRecordArrayType(objType)); + + QualType recordType = hlsl::GetHLSLNodeIOResultType(astContext, objType); + SpirvInstruction *res = doExpr(obj); + SpirvInstruction *index = + n ? 
doExpr(callExpr->getArg(0)) + : spvBuilder.getConstantInt(astContext.UnsignedIntTy, + llvm::APInt(32, 0)); + res->setLayoutRule(SpirvLayoutRule::Scalar); + + return spvBuilder.createAccessChain(recordType, res, {index}, + callExpr->getExprLoc(), + callExpr->getSourceRange()); +} + +SpirvInstruction *SpirvEmitter::processIntrinsicGetRemainingRecursionLevels( + const CallExpr *callExpr) { + assert(callExpr->getNumArgs() == 0); + const auto loc = callExpr->getExprLoc(); + const QualType retType = callExpr->getCallReturnType(astContext); + auto *var = declIdMapper.getBuiltinVar( + spv::BuiltIn::RemainingRecursionLevelsAMDX, retType, loc); + return spvBuilder.createLoad(retType, var, loc); +} + +SpirvInstruction * +SpirvEmitter::processIntrinsicIsValid(const CXXMemberCallExpr *callExpr) { + assert(callExpr->getNumArgs() == 0); + const auto loc = callExpr->getExprLoc(); + const Expr *nodeOutputExpr = callExpr->getImplicitObjectArgument(); + Expr *baseExpr = const_cast(nodeOutputExpr); + SpirvInstruction *shaderIndex = nullptr; + + if (const auto subExpr = dyn_cast_or_null( + nodeOutputExpr->IgnoreParenNoopCasts(astContext))) { + if (subExpr->getOperator() == OverloadedOperatorKind::OO_Subscript) { + // special case: offset shader index by the array subscript + shaderIndex = doExpr(subExpr->getArg(1)); + baseExpr = const_cast(subExpr->getArg(0)); + } + } + + const auto *declRefExpr = dyn_cast(baseExpr->IgnoreImpCasts()); + const auto *paramDecl = dyn_cast(declRefExpr->getDecl()); + int nodeIndex = 0; + if (HLSLNodeIdAttr *nodeId = paramDecl->getAttr()) { + nodeIndex = nodeId->getArrayIndex(); + } + + SpirvInstruction *payload = doExpr(baseExpr); + if (!shaderIndex) { + shaderIndex = spvBuilder.getConstantInt(astContext.UnsignedIntTy, + llvm::APInt(32, nodeIndex)); + } + + return spvBuilder.createIsNodePayloadValid(payload, shaderIndex, loc); +} + +SpirvInstruction *SpirvEmitter::processIntrinsicGetNodeOutputRecords( + const CXXMemberCallExpr *callExpr, bool isGroupShared) { + assert(callExpr->getNumArgs() == 1); + const auto loc = callExpr->getExprLoc(); + const Expr *nodeOutputExpr = callExpr->getImplicitObjectArgument(); + Expr *baseExpr = const_cast(nodeOutputExpr); + SpirvInstruction *shaderIndex = nullptr; + + if (const auto subExpr = dyn_cast_or_null( + nodeOutputExpr->IgnoreParenNoopCasts(astContext))) { + if (subExpr->getOperator() == OverloadedOperatorKind::OO_Subscript) { + // special case: offset shader index by the array subscript + shaderIndex = doExpr(subExpr->getArg(1)); + baseExpr = const_cast(subExpr->getArg(0)); + } + } + + const auto *declRefExpr = dyn_cast(baseExpr->IgnoreImpCasts()); + const auto *paramDecl = dyn_cast(declRefExpr->getDecl()); + if (!shaderIndex) { + shaderIndex = + spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0)); + } + + LowerTypeVisitor lowerTypeVisitor(astContext, spvContext, spirvOptions, + spvBuilder); + const SpirvType *elemType = lowerTypeVisitor.lowerType( + hlsl::GetHLSLNodeIOResultType(astContext, baseExpr->getType()), + clang::spirv::SpirvLayoutRule::Scalar, llvm::None, + paramDecl->getLocation()); + const SpirvType *payloadType = spvContext.getPointerType( + spvContext.getNodePayloadArrayType(elemType, paramDecl), + spv::StorageClass::NodePayloadAMDX); + + spv::Scope scope = + isGroupShared ? 
spv::Scope::Workgroup : spv::Scope::Invocation; + SpirvInstruction *recordCount = doExpr(callExpr->getArg(0)); + SpirvInstruction *result = spvBuilder.createAllocateNodePayloads( + callExpr->getType(), scope, shaderIndex, recordCount, loc); + result->setResultType(payloadType); + spvContext.addToInstructionsWithLoweredType(result); + return result; +} + +SpirvInstruction *SpirvEmitter::processIntrinsicIncrementOutputCount( + const CXXMemberCallExpr *callExpr, bool isGroupShared) { + return processIntrinsicGetNodeOutputRecords(callExpr, isGroupShared); +} + +void SpirvEmitter::processIntrinsicOutputComplete( + const CXXMemberCallExpr *callExpr) { + Expr *payloadExpr = + callExpr->getImplicitObjectArgument()->IgnoreParenNoopCasts(astContext); + SpirvInstruction *payload = doExpr(payloadExpr); + spvBuilder.createEnqueueOutputNodePayloads(payload, callExpr->getExprLoc()); +} + +SpirvInstruction *SpirvEmitter::processIntrinsicFinishedCrossGroupSharing( + const CXXMemberCallExpr *callExpr) { + Expr *payloadExpr = callExpr->getImplicitObjectArgument(); + SpirvInstruction *payload = doExpr(payloadExpr); + return spvBuilder.createFinishWritingNodePayload(payload, + callExpr->getExprLoc()); +} + +SpirvInstruction * +SpirvEmitter::processIntrinsicBarrier(const CallExpr *callExpr) { + llvm::APSInt a1(32, true), a2(32, true); + int64_t i1, i2; + const Expr *e1 = callExpr->getArg(0), *e2 = callExpr->getArg(1); + + // object as first argument + if (!e1->EvaluateAsInt(a1, astContext)) { + assert(e1->getType()->isStructureOrClassType()); + a1.setAllBits(); + } + + if (e2->EvaluateAsInt(a2, astContext) && (i1 = a1.getExtValue()) >= 0 && + (i2 = a2.getExtValue()) >= 0) { + } else { + emitError("Barrier arguments must be non-negative integer constants", + callExpr->getExprLoc()); + return nullptr; + } + + if (!(i1 | i2)) { // all zero -> no-op + return nullptr; + } + + spv::Scope memScope = + (i2 & (unsigned)hlsl::DXIL::BarrierSemanticFlag::DeviceScope) + ? spv::Scope::Device + : (i2 & (unsigned)hlsl::DXIL::BarrierSemanticFlag::GroupScope) + ? spv::Scope::Workgroup + : spv::Scope::Invocation; + spv::MemorySemanticsMask memSemaMask = + spv::MemorySemanticsMask::AcquireRelease | + ((i1 & (unsigned)hlsl::DXIL::MemoryTypeFlag::UavMemory) + ? spv::MemorySemanticsMask::UniformMemory + : spv::MemorySemanticsMask::MaskNone) | + ((i1 & (unsigned)hlsl::DXIL::MemoryTypeFlag::GroupSharedMemory) + ? spv::MemorySemanticsMask::WorkgroupMemory + : spv::MemorySemanticsMask::MaskNone) | + ((i1 & (unsigned)hlsl::DXIL::MemoryTypeFlag::NodeOutputMemory) + ? spv::MemorySemanticsMask::OutputMemory + : spv::MemorySemanticsMask::MaskNone); + Optional execScope = + (i2 & (unsigned)hlsl::DXIL::BarrierSemanticFlag::GroupSync) + ? 
Optional(spv::Scope::Workgroup) + : None; + + spvBuilder.createBarrier(memScope, memSemaMask, execScope, + callExpr->getExprLoc()); + return nullptr; +} + SpirvInstruction * SpirvEmitter::processIntrinsicMemoryBarrier(const CallExpr *callExpr, bool isDevice, bool groupSync, @@ -11283,8 +11691,8 @@ SpirvInstruction *SpirvEmitter::processIntrinsicMul(const CallExpr *callExpr) { uint32_t numRows = 0; if (isMxNMatrix(returnType, &elemType, &numRows)) { llvm::SmallVector rows; - auto *arg0Id = doExpr(arg0); - auto *arg1Id = doExpr(arg1); + auto *arg0Id = doExprEnsuringRValue(arg0, loc, range); + auto *arg1Id = doExprEnsuringRValue(arg1, loc, range); for (uint32_t i = 0; i < numRows; ++i) { auto *scalar = spvBuilder.createCompositeExtract(elemType, arg0Id, {i}, loc, range); @@ -11299,8 +11707,8 @@ SpirvInstruction *SpirvEmitter::processIntrinsicMul(const CallExpr *callExpr) { } // All the following cases require handling arg0 and arg1 expressions first. - auto *arg0Id = doExpr(arg0); - auto *arg1Id = doExpr(arg1); + auto *arg0Id = doExprEnsuringRValue(arg0, loc, range); + auto *arg1Id = doExprEnsuringRValue(arg1, loc, range); // mul(scalar, scalar) if (isScalarType(arg0Type) && isScalarType(arg1Type)) @@ -12095,8 +12503,7 @@ SpirvInstruction *SpirvEmitter::processIntrinsicUsingSpirvInst( case spv::Op::OpFwidth: case spv::Op::OpFwidthFine: case spv::Op::OpFwidthCoarse: - if (spvContext.isCS()) - addDerivativeGroupExecutionMode(); + addDerivativeGroupExecutionMode(); needsLegalization = true; break; default: @@ -12931,7 +13338,7 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) { : spv::StorageClass::Output; auto *payloadArg = doExpr(args[3]); bool isValid = false; - const VarDecl *param = nullptr; + SpirvInstruction *param = nullptr; if (const auto *implCastExpr = dyn_cast(args[3])) { if (const auto *arg = dyn_cast(implCastExpr->getSubExpr())) { if (const auto *paramDecl = dyn_cast(arg->getDecl())) { @@ -12939,7 +13346,8 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) { isValid = declIdMapper.createPayloadStageVars( sigPoint, sc, paramDecl, /*asInput=*/false, paramDecl->getType(), "out.var", &payloadArg); - param = paramDecl; + param = + declIdMapper.getDeclEvalInfo(paramDecl, paramDecl->getLocation()); } } } @@ -12956,7 +13364,7 @@ void SpirvEmitter::processDispatchMesh(const CallExpr *callExpr) { if (featureManager.isExtensionEnabled(Extension::EXT_mesh_shader)) { // for EXT_mesh_shader, create opEmitMeshTasksEXT. - spvBuilder.createEmitMeshTasksEXT(threadX, threadY, threadZ, loc, nullptr, + spvBuilder.createEmitMeshTasksEXT(threadX, threadY, threadZ, loc, param, range); } else { // for NV_mesh_shader, set TaskCountNV = threadX * threadY * threadZ. 
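// --- Editor's illustrative aside (not part of this patch) ---------------------
// A standalone sketch of how the Barrier intrinsic's two constant arguments are
// decoded in processIntrinsicBarrier above: the semantic flags pick the memory
// scope (and an optional execution barrier), the memory-type flags pick the
// memory-semantics bits, and AcquireRelease is always included. The bit values
// below are illustrative assumptions, not the actual hlsl::DXIL constants.
#include <cstdint>
#include <cstdio>

// Assumed stand-ins for hlsl::DXIL::MemoryTypeFlag / BarrierSemanticFlag.
constexpr uint32_t kUavMemory = 0x1, kGroupSharedMemory = 0x2, kNodeOutputMemory = 0x8;
constexpr uint32_t kGroupSync = 0x1, kGroupScope = 0x2, kDeviceScope = 0x4;

struct BarrierLowering {
  const char *memoryScope;  // widest requested memory scope
  bool uniformMemory;       // UAV accesses included in the semantics mask
  bool workgroupMemory;     // groupshared accesses included
  bool outputMemory;        // node output payloads included
  bool hasExecutionBarrier; // GroupSync also synchronizes execution
};

// Mirrors the selection order in the hunk: Device > Workgroup > Invocation.
BarrierLowering lowerBarrier(uint32_t memoryTypeFlags, uint32_t semanticFlags) {
  BarrierLowering out{};
  out.memoryScope = (semanticFlags & kDeviceScope)  ? "Device"
                    : (semanticFlags & kGroupScope) ? "Workgroup"
                                                    : "Invocation";
  out.uniformMemory = memoryTypeFlags & kUavMemory;
  out.workgroupMemory = memoryTypeFlags & kGroupSharedMemory;
  out.outputMemory = memoryTypeFlags & kNodeOutputMemory;
  out.hasExecutionBarrier = semanticFlags & kGroupSync;
  return out;
}

int main() {
  BarrierLowering l = lowerBarrier(kGroupSharedMemory, kGroupScope | kGroupSync);
  std::printf("scope=%s workgroupMem=%d execBarrier=%d\n", l.memoryScope,
              l.workgroupMemory, l.hasExecutionBarrier);
}
// --- End editor's aside -------------------------------------------------------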
@@ -13180,6 +13588,7 @@ hlsl::ShaderModel::Kind SpirvEmitter::getShaderModelKind(StringRef stageName) { .Case("callable", hlsl::ShaderModel::Kind::Callable) .Case("mesh", hlsl::ShaderModel::Kind::Mesh) .Case("amplification", hlsl::ShaderModel::Kind::Amplification) + .Case("node", hlsl::ShaderModel::Kind::Node) .Default(hlsl::ShaderModel::Kind::Invalid); assert(SMK != hlsl::ShaderModel::Kind::Invalid); return SMK; @@ -13200,6 +13609,7 @@ SpirvEmitter::getSpirvShaderStage(hlsl::ShaderModel::Kind smk, case hlsl::ShaderModel::Kind::Pixel: return spv::ExecutionModel::Fragment; case hlsl::ShaderModel::Kind::Compute: + case hlsl::ShaderModel::Kind::Node: return spv::ExecutionModel::GLCompute; case hlsl::ShaderModel::Kind::RayGeneration: return spv::ExecutionModel::RayGenerationNV; @@ -13420,6 +13830,21 @@ void SpirvEmitter::processPixelShaderAttributes(const FunctionDecl *decl) { } } +void SpirvEmitter::checkForWaveSizeAttr(const FunctionDecl *decl) { + if (auto *waveSizeAttr = decl->getAttr()) { + // Not supported in Vulkan SPIR-V, warn and ignore. + + // SPIR-V SubgroupSize execution mode would work but it is Kernel only + // (requires the SubgroupDispatch capability, which implies the + // DeviceEnqueue capability, which is Kernel only). Subgroup sizes can be + // specified in Vulkan on the application side via + // VK_EXT_subgroup_size_control. + emitWarning("Wave size is not supported by Vulkan SPIR-V. Consider using " + "VK_EXT_subgroup_size_control.", + waveSizeAttr->getLocation()); + } +} + void SpirvEmitter::processComputeShaderAttributes(const FunctionDecl *decl) { auto *numThreadsAttr = decl->getAttr(); assert(numThreadsAttr && "thread group size missing from entry-point"); @@ -13431,19 +13856,82 @@ void SpirvEmitter::processComputeShaderAttributes(const FunctionDecl *decl) { spvBuilder.addExecutionMode(entryFunction, spv::ExecutionMode::LocalSize, {x, y, z}, decl->getLocation()); - auto *waveSizeAttr = decl->getAttr(); - if (waveSizeAttr) { - // Not supported in Vulkan SPIR-V, warn and ignore. + checkForWaveSizeAttr(decl); +} - // SPIR-V SubgroupSize execution mode would work but it is Kernel only - // (requires the SubgroupDispatch capability, which implies the - // DeviceEnqueue capability, which is Kernel only). Subgroup sizes can be - // specified in Vulkan on the application side via - // VK_EXT_subgroup_size_control. - emitWarning("Wave size is not supported by Vulkan SPIR-V. Consider using " - "VK_EXT_subgroup_size_control.", - waveSizeAttr->getLocation()); +void SpirvEmitter::processNodeShaderAttributes(const FunctionDecl *decl) { + uint32_t x = 1, y = 1, z = 1; + if (auto *numThreadsAttr = decl->getAttr()) { + x = static_cast(numThreadsAttr->getX()); + y = static_cast(numThreadsAttr->getY()); + z = static_cast(numThreadsAttr->getZ()); + } + spvBuilder.addExecutionMode(entryFunction, spv::ExecutionMode::LocalSize, + {x, y, z}, decl->getLocation()); + + auto *nodeLaunchAttr = decl->getAttr(); + StringRef launchType = nodeLaunchAttr ? 
nodeLaunchAttr->getLaunchType() : ""; + if (launchType.equals("coalescing") || launchType.equals("thread")) { + spvBuilder.addExecutionMode(entryFunction, + spv::ExecutionMode::CoalescingAMDX, {}, + decl->getLocation()); } + + uint64_t nodeId = 0; + if (const auto nodeIdAttr = decl->getAttr()) + nodeId = static_cast(nodeIdAttr->getArrayIndex()); + spvBuilder.addExecutionModeId( + entryFunction, spv::ExecutionMode::ShaderIndexAMDX, + {spvBuilder.getConstantInt(astContext.UnsignedIntTy, + llvm::APInt(32, nodeId))}, + decl->getLocation()); + + if (const auto *nodeMaxRecursionDepthAttr = + decl->getAttr()) { + SpirvInstruction *count = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, + llvm::APInt(32, nodeMaxRecursionDepthAttr->getCount())); + spvBuilder.addExecutionModeId(entryFunction, + spv::ExecutionMode::MaxNodeRecursionAMDX, + {count}, decl->getLocation()); + } + + if (const auto *nodeShareInputOfAttr = + decl->getAttr()) { + SpirvInstruction *name = + spvBuilder.getConstantString(nodeShareInputOfAttr->getName()); + SpirvInstruction *index = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, + llvm::APInt(32, nodeShareInputOfAttr->getArrayIndex())); + spvBuilder.addExecutionModeId(entryFunction, + spv::ExecutionMode::SharesInputWithAMDX, + {name, index}, decl->getLocation()); + } + + if (const auto *dispatchGrid = decl->getAttr()) { + SpirvInstruction *gridX = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getX())); + SpirvInstruction *gridY = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getY())); + SpirvInstruction *gridZ = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, dispatchGrid->getZ())); + spvBuilder.addExecutionModeId(entryFunction, + spv::ExecutionMode::StaticNumWorkgroupsAMDX, + {gridX, gridY, gridZ}, decl->getLocation()); + } else if (const auto *maxDispatchGrid = + decl->getAttr()) { + SpirvInstruction *gridX = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getX())); + SpirvInstruction *gridY = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getY())); + SpirvInstruction *gridZ = spvBuilder.getConstantInt( + astContext.UnsignedIntTy, llvm::APInt(32, maxDispatchGrid->getZ())); + spvBuilder.addExecutionModeId(entryFunction, + spv::ExecutionMode::MaxNumWorkgroupsAMDX, + {gridX, gridY, gridZ}, decl->getLocation()); + } + + checkForWaveSizeAttr(decl); } bool SpirvEmitter::processTessellationShaderAttributes( @@ -13535,8 +14023,8 @@ bool SpirvEmitter::processTessellationShaderAttributes( } bool SpirvEmitter::emitEntryFunctionWrapperForRayTracing( - const FunctionDecl *decl, SpirvDebugFunction *debugFunction, - SpirvFunction *entryFuncInstr) { + const FunctionDecl *decl, RichDebugInfo **info, + SpirvDebugFunction *debugFunction, SpirvFunction *entryFuncInstr) { // The entry basic block. 
auto *entryLabel = spvBuilder.createBasicBlock(); spvBuilder.setInsertPoint(entryLabel); @@ -13645,6 +14133,10 @@ bool SpirvEmitter::emitEntryFunctionWrapperForRayTracing( spvBuilder.createReturn(decl->getBody()->getLocEnd()); spvBuilder.endFunction(); + if (spirvOptions.debugInfoRich && decl->hasBody()) { + spvContext.popDebugLexicalScope(*info); + } + return true; } @@ -13859,7 +14351,9 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( astContext.VoidTy, decl->getLocStart(), decl->getName()); if (spirvOptions.debugInfoRich && decl->hasBody()) { - *debugFunction = emitDebugFunction(decl, entryFunction, info, "wrapper"); + *debugFunction = + emitDebugFunction(decl, entryFunction, info, "__dxc_setup"); + spvContext.pushDebugLexicalScope(*info, *debugFunction); } // Specify that entryFunction is an entry function wrapper. @@ -13876,7 +14370,7 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( entryInfo->entryFunction = entryFunction; if (spvContext.isRay()) { - return emitEntryFunctionWrapperForRayTracing(decl, *debugFunction, + return emitEntryFunctionWrapperForRayTracing(decl, info, *debugFunction, entryFuncInstr) ? entryFunction : nullptr; @@ -13886,6 +14380,8 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( processPixelShaderAttributes(decl); } else if (spvContext.isCS()) { processComputeShaderAttributes(decl); + } else if (spvContext.isNode()) { + processNodeShaderAttributes(decl); } else if (spvContext.isHS()) { if (!processTessellationShaderAttributes(decl, &numOutputControlPoints)) return nullptr; @@ -13994,12 +14490,23 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( llvm::SmallVector params; for (const auto *param : decl->params()) { const auto paramType = param->getType(); + if (hlsl::IsHLSLNodeInputType(paramType)) { + SpirvInstruction *value = nullptr; + if (!declIdMapper.createStageInputVar(param, &value, false)) + return nullptr; + if (value && value->getKind() == SpirvInstruction::Kind::IK_Variable) { + handleNodePayloadArrayType(param, value); + params.push_back(value); + } + continue; + } + std::string tempVarName = "param.var." + param->getNameAsString(); auto *tempVar = spvBuilder.addFnVar(paramType, param->getLocation(), tempVarName, param->hasAttr(), param->hasAttr()); - + handleNodePayloadArrayType(param, tempVar); params.push_back(tempVar); // Create the stage input variable for parameter not marked as pure out and @@ -14017,6 +14524,9 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( if (!declIdMapper.createStageInputVar(param, &loadedValue, false)) return nullptr; + if (loadedValue) { + handleNodePayloadArrayType(param, loadedValue); + } // Only initialize the temporary variable if the parameter is indeed used, // or if it is an inout parameter. 
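// --- Editor's illustrative aside (not part of this patch) ---------------------
// A self-contained sketch of the attribute-to-execution-mode selection that the
// processNodeShaderAttributes hunk above performs. The structs here are plain
// stand-ins for the clang attributes; the real code also always emits LocalSize
// and ShaderIndexAMDX, which are omitted here for brevity.
#include <cstdio>
#include <optional>
#include <string>
#include <vector>

struct DispatchGrid { unsigned x, y, z; };

struct NodeAttrs {
  std::string launchType;                 // "broadcasting", "coalescing", "thread"
  std::optional<DispatchGrid> staticGrid; // [NodeDispatchGrid(x, y, z)]
  std::optional<DispatchGrid> maxGrid;    // [NodeMaxDispatchGrid(x, y, z)]
};

// Coalescing and thread launches map to CoalescingAMDX; a static dispatch grid
// takes precedence over a maximum grid, matching the order of checks above.
std::vector<std::string> executionModesFor(const NodeAttrs &a) {
  std::vector<std::string> modes;
  if (a.launchType == "coalescing" || a.launchType == "thread")
    modes.push_back("CoalescingAMDX");
  if (a.staticGrid)
    modes.push_back("StaticNumWorkgroupsAMDX");
  else if (a.maxGrid)
    modes.push_back("MaxNumWorkgroupsAMDX");
  return modes;
}

int main() {
  NodeAttrs a{"broadcasting", DispatchGrid{4, 1, 1}, std::nullopt};
  for (const auto &m : executionModesFor(a))
    std::printf("%s\n", m.c_str());
}
// --- End editor's aside -------------------------------------------------------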
@@ -14101,6 +14611,10 @@ SpirvFunction *SpirvEmitter::emitEntryFunctionWrapper( if (spvContext.isHS()) doDecl(patchConstFunc); + if (spirvOptions.debugInfoRich && decl->hasBody()) { + spvContext.popDebugLexicalScope(*info); + } + return entryFunction; } @@ -14880,8 +15394,12 @@ SpirvEmitter::createSpirvIntrInstExt(llvm::ArrayRef attrs, SpirvInstruction *SpirvEmitter::invertYIfRequested(SpirvInstruction *position, SourceLocation loc, SourceRange range) { - // Negate SV_Position.y if requested - if (spirvOptions.invertY) { + // Negate SV_Position.y if requested and supported + + bool supportsInvertY = spvContext.isVS() || spvContext.isGS() || + spvContext.isDS() || spvContext.isMS(); + + if (spirvOptions.invertY && supportsInvertY) { const auto oldY = spvBuilder.createCompositeExtract( astContext.FloatTy, position, {1}, loc, range); const auto newY = spvBuilder.createUnaryOp( @@ -15120,8 +15638,7 @@ SpirvEmitter::processCooperativeMatrixGetLength(const CallExpr *call) { } SpirvInstruction * -SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr, - bool useIdParams) { +SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr) { llvm::SmallVector execModesParams; uint32_t exeMode = 0; const auto args = expr->getArgs(); @@ -15145,9 +15662,38 @@ SpirvEmitter::processIntrinsicExecutionMode(const CallExpr *expr, assert(entryFunction != nullptr); assert(exeMode != 0); - return spvBuilder.addExecutionMode( - entryFunction, static_cast(exeMode), execModesParams, - expr->getExprLoc(), useIdParams); + return spvBuilder.addExecutionMode(entryFunction, + static_cast(exeMode), + execModesParams, expr->getExprLoc()); +} + +SpirvInstruction * +SpirvEmitter::processIntrinsicExecutionModeId(const CallExpr *expr) { + assert(expr->getNumArgs() > 0); + uint32_t exeMode = 0; + const Expr *modeExpr = expr->getArg(0); + Expr::EvalResult evalResult; + if (modeExpr->EvaluateAsRValue(evalResult, astContext) && + !evalResult.HasSideEffects && evalResult.Val.isInt()) { + exeMode = evalResult.Val.getInt().getZExtValue(); + } else { + emitError("The execution mode must be constant integer", + expr->getExprLoc()); + return nullptr; + } + + llvm::SmallVector execModesParams; + const auto args = expr->getArgs(); + for (uint32_t i = 1; i < expr->getNumArgs(); ++i) { + const Expr *argExpr = args[i]; + SpirvInstruction *argInst = doExpr(argExpr); + execModesParams.push_back(argInst); + } + + assert(entryFunction != nullptr); + return spvBuilder.addExecutionModeId(entryFunction, + static_cast(exeMode), + execModesParams, expr->getExprLoc()); } SpirvInstruction * @@ -15215,11 +15761,33 @@ bool SpirvEmitter::spirvToolsValidate(std::vector *mod, return tools.Validate(mod->data(), mod->size(), options); } +static bool canUseDerivativeGroupExecutionMode(SpirvContext::ShaderModelKind sm, + bool usingEXTMeshShader) { + switch (sm) { + case SpirvContext::ShaderModelKind::Compute: + case SpirvContext::ShaderModelKind::Node: + return true; + + // The KHR extension that allows derivative instruction in mesh and task + // (amplification) shader does not work with SPV_NV_mesh_shader extesion. 
+ case SpirvContext::ShaderModelKind::Mesh: + case SpirvContext::ShaderModelKind::Amplification: + return usingEXTMeshShader; + default: + return false; + } +} + void SpirvEmitter::addDerivativeGroupExecutionMode() { - assert(spvContext.isCS()); + bool usingEXTMeshShader = + featureManager.isExtensionEnabled(Extension::EXT_mesh_shader); + SpirvContext::ShaderModelKind sm = spvContext.getCurrentShaderModelKind(); + if (!canUseDerivativeGroupExecutionMode(sm, usingEXTMeshShader)) + return; - SpirvExecutionMode *numThreadsEm = spvBuilder.getModule()->findExecutionMode( - entryFunction, spv::ExecutionMode::LocalSize); + SpirvExecutionMode *numThreadsEm = + cast(spvBuilder.getModule()->findExecutionMode( + entryFunction, spv::ExecutionMode::LocalSize)); auto numThreads = numThreadsEm->getParams(); // The layout of the quad is determined by the numer of threads in each diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h index 79d2c43c35..ada8db3068 100644 --- a/tools/clang/lib/SPIRV/SpirvEmitter.h +++ b/tools/clang/lib/SPIRV/SpirvEmitter.h @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// // // This file defines a SPIR-V emitter class that takes in HLSL AST and emits @@ -83,6 +80,9 @@ class SpirvEmitter : public ASTConsumer { void doDecl(const Decl *decl); void doStmt(const Stmt *stmt, llvm::ArrayRef attrs = {}); SpirvInstruction *doExpr(const Expr *expr, SourceRange rangeOverride = {}); + SpirvInstruction *doExprEnsuringRValue(const Expr *expr, + SourceLocation location, + SourceRange range); /// Processes the given expression and emits SPIR-V instructions. If the /// result is a GLValue, does an additional load. @@ -126,6 +126,8 @@ class SpirvEmitter : public ASTConsumer { SourceRange range = {}); private: + bool handleNodePayloadArrayType(const ParmVarDecl *decl, + SpirvInstruction *instr); void doFunctionDecl(const FunctionDecl *decl); void doVarDecl(const VarDecl *decl); void doRecordDecl(const RecordDecl *decl); @@ -176,7 +178,8 @@ class SpirvEmitter : public ASTConsumer { /// Overload with pre computed SpirvEvalInfo. /// /// The given expr will not be evaluated again. - SpirvInstruction *loadIfGLValue(const Expr *expr, SpirvInstruction *info); + SpirvInstruction *loadIfGLValue(const Expr *expr, SpirvInstruction *info, + SourceRange rangeOverride = {}); /// Loads the pointer of the aliased-to-variable if the given expression is a /// DeclRefExpr referencing an alias variable. See DeclResultIdMapper for @@ -225,6 +228,8 @@ class SpirvEmitter : public ASTConsumer { QualType lhsValType, SourceLocation loc, SourceRange range = {}); + bool canUseOpCopyLogical(QualType type) const; + /// Decomposes and reconstructs the given srcVal of the given valType to meet /// the requirements of the dstLR layout rule. SpirvInstruction *reconstructValue(SpirvInstruction *srcVal, QualType valType, @@ -504,6 +509,9 @@ class SpirvEmitter : public ASTConsumer { SpirvInstruction * processIntrinsicGetBufferContents(const CXXMemberCallExpr *); + /// Processes the 'Barrier' intrinsic function. 
+ SpirvInstruction *processIntrinsicBarrier(const CallExpr *); + /// Processes the 'GroupMemoryBarrier', 'GroupMemoryBarrierWithGroupSync', /// 'DeviceMemoryBarrier', 'DeviceMemoryBarrierWithGroupSync', /// 'AllMemoryBarrier', and 'AllMemoryBarrierWithGroupSync' intrinsic @@ -512,6 +520,40 @@ class SpirvEmitter : public ASTConsumer { bool isDevice, bool groupSync, bool isAllBarrier); + /// Processes the 'GetRemainingRecursionLevels' intrinsic function. + SpirvInstruction * + processIntrinsicGetRemainingRecursionLevels(const CallExpr *callExpr); + + /// Processes the 'IsValid' intrinsic function. + SpirvInstruction *processIntrinsicIsValid(const CXXMemberCallExpr *callExpr); + + /// Processes the 'Get' intrinsic function for (arrays of) node records and + /// the array subscript operator for node record arrays. + SpirvInstruction * + processIntrinsicExtractRecordStruct(const CXXMemberCallExpr *callExpr); + + /// Processes the 'GetGroupNodeOutputRecords' and 'GetThreadNodeOutputRecords' + /// intrinsic functions. + SpirvInstruction * + processIntrinsicGetNodeOutputRecords(const CXXMemberCallExpr *callExpr, + bool isGroupShared); + + /// Processes the 'IncrementOutputCount' intrinsic function. + SpirvInstruction * + processIntrinsicIncrementOutputCount(const CXXMemberCallExpr *callExpr, + bool isGroupShared); + + /// Processes the 'Count' intrinsic function for node input record arrays. + SpirvInstruction * + processIntrinsicGetRecordCount(const CXXMemberCallExpr *callExpr); + + /// Processes the 'OutputComplete' intrinsic function. + void processIntrinsicOutputComplete(const CXXMemberCallExpr *callExpr); + + /// Processes the 'FinishedCrossGroupSharing' intrinsic function. + SpirvInstruction * + processIntrinsicFinishedCrossGroupSharing(const CXXMemberCallExpr *callExpr); + /// Processes the 'mad' intrinsic function. SpirvInstruction *processIntrinsicMad(const CallExpr *); @@ -781,13 +823,29 @@ class SpirvEmitter : public ASTConsumer { SpirvInstruction *processCooperativeMatrixGetLength(const CallExpr *call); /// Process vk::ext_execution_mode intrinsic - SpirvInstruction *processIntrinsicExecutionMode(const CallExpr *expr, - bool useIdParams); + SpirvInstruction *processIntrinsicExecutionMode(const CallExpr *expr); + /// Process vk::ext_execution_mode_id intrinsic + SpirvInstruction *processIntrinsicExecutionModeId(const CallExpr *expr); /// Processes the 'firstbit{high|low}' intrinsic functions. SpirvInstruction *processIntrinsicFirstbit(const CallExpr *, GLSLstd450 glslOpcode); + SpirvInstruction * + processMatrixDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode, + const Expr *arg, SourceLocation loc, + SourceRange range); + + SpirvInstruction *processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode, + const Expr *arg, + SourceLocation loc, + SourceRange range); + + SpirvInstruction *processDerivativeIntrinsic(hlsl::IntrinsicOp hlslOpcode, + SpirvInstruction *arg, + SourceLocation loc, + SourceRange range); + private: /// Returns the for constant value 0 of the given type. SpirvConstant *getValueZero(QualType type); @@ -833,6 +891,7 @@ class SpirvEmitter : public ASTConsumer { static hlsl::ShaderModel::Kind getShaderModelKind(StringRef stageName); static spv::ExecutionModel getSpirvShaderStage(hlsl::ShaderModel::Kind smk, bool); + void checkForWaveSizeAttr(const FunctionDecl *decl); /// \brief Handle inline SPIR-V attributes for the entry function. 
void processInlineSpirvAttributes(const FunctionDecl *entryFunction); @@ -859,6 +918,10 @@ class SpirvEmitter : public ASTConsumer { /// HLSL attributes of the entry point function. void processComputeShaderAttributes(const FunctionDecl *entryFunction); + /// \brief Adds necessary execution modes for the node shader based on the + /// HLSL attributes of the entry point function. + void processNodeShaderAttributes(const FunctionDecl *entryFunction); + /// \brief Adds necessary execution modes for the mesh/amplification shader /// based on the HLSL attributes of the entry point function. bool @@ -895,6 +958,7 @@ class SpirvEmitter : public ASTConsumer { /// The wrapper function is also responsible for initializing global static /// variables for some cases. bool emitEntryFunctionWrapperForRayTracing(const FunctionDecl *entryFunction, + RichDebugInfo **info, SpirvDebugFunction *debugFunction, SpirvFunction *entryFuncId); diff --git a/tools/clang/lib/SPIRV/SpirvInstruction.cpp b/tools/clang/lib/SPIRV/SpirvInstruction.cpp index f41de03adc..88d669d397 100644 --- a/tools/clang/lib/SPIRV/SpirvInstruction.cpp +++ b/tools/clang/lib/SPIRV/SpirvInstruction.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// // // This file implements the in-memory representation of SPIR-V instructions. @@ -33,7 +30,9 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExtension) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExtInstImport) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvMemoryModel) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvEntryPoint) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionModeBase) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionMode) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvExecutionModeId) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvString) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvSource) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvModuleProcessed) @@ -53,6 +52,11 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvUnreachable) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAccessChain) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAtomic) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBarrier) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvIsNodePayloadValid) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvNodePayloadArrayLength) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvAllocateNodePayloads) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvEnqueueNodePayloads) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvFinishWritingNodePayload) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBinaryOp) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBitFieldExtract) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvBitFieldInsert) @@ -60,6 +64,7 @@ DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantBoolean) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantInteger) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantFloat) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantComposite) +DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantString) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConstantNull) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConvertPtrToU) DEFINE_INVOKE_VISITOR_FOR_CLASS(SpirvConvertUToPtr) @@ -207,13 +212,16 @@ SpirvEntryPoint::SpirvEntryPoint(SourceLocation loc, // OpExecutionMode and OpExecutionModeId instructions SpirvExecutionMode::SpirvExecutionMode(SourceLocation loc, SpirvFunction *entry, spv::ExecutionMode em, - llvm::ArrayRef paramsVec, - bool usesIdParams) - : SpirvInstruction(IK_ExecutionMode, - usesIdParams ? 
spv::Op::OpExecutionModeId - : spv::Op::OpExecutionMode, - QualType(), loc), - entryPoint(entry), execMode(em), + llvm::ArrayRef paramsVec) + : SpirvExecutionModeBase(IK_ExecutionMode, spv::Op::OpExecutionMode, loc, + entry, em), + params(paramsVec.begin(), paramsVec.end()) {} + +SpirvExecutionModeId::SpirvExecutionModeId( + SourceLocation loc, SpirvFunction *entry, spv::ExecutionMode em, + llvm::ArrayRef paramsVec) + : SpirvExecutionModeBase(IK_ExecutionModeId, spv::Op::OpExecutionModeId, + loc, entry, em), params(paramsVec.begin(), paramsVec.end()) {} SpirvString::SpirvString(SourceLocation loc, llvm::StringRef stringLiteral) @@ -467,6 +475,41 @@ SpirvBarrier::SpirvBarrier(SourceLocation loc, spv::Scope memScope, memoryScope(memScope), memorySemantics(memSemantics), executionScope(execScope) {} +SpirvIsNodePayloadValid::SpirvIsNodePayloadValid(QualType resultType, + SourceLocation loc, + SpirvInstruction *payloadArray, + SpirvInstruction *nodeIndex) + : SpirvInstruction(IK_IsNodePayloadValid, spv::Op::OpIsNodePayloadValidAMDX, + resultType, loc), + payloadArray(payloadArray), nodeIndex(nodeIndex) {} + +SpirvNodePayloadArrayLength::SpirvNodePayloadArrayLength( + QualType resultType, SourceLocation loc, SpirvInstruction *payloadArray) + : SpirvInstruction(IK_NodePayloadArrayLength, + spv::Op::OpNodePayloadArrayLengthAMDX, resultType, loc), + payloadArray(payloadArray) {} + +SpirvAllocateNodePayloads::SpirvAllocateNodePayloads( + QualType resultType, SourceLocation loc, spv::Scope allocationScope, + SpirvInstruction *shaderIndex, SpirvInstruction *recordCount) + : SpirvInstruction(IK_AllocateNodePayloads, + spv::Op::OpAllocateNodePayloadsAMDX, resultType, loc), + allocationScope(allocationScope), shaderIndex(shaderIndex), + recordCount(recordCount) {} + +SpirvEnqueueNodePayloads::SpirvEnqueueNodePayloads(SourceLocation loc, + SpirvInstruction *payload) + : SpirvInstruction(IK_EnqueueNodePayloads, + spv::Op::OpEnqueueNodePayloadsAMDX, QualType(), loc), + payload(payload) {} + +SpirvFinishWritingNodePayload::SpirvFinishWritingNodePayload( + QualType resultType, SourceLocation loc, SpirvInstruction *payload) + : SpirvInstruction(IK_FinishWritingNodePayload, + spv::Op::OpFinishWritingNodePayloadAMDX, resultType, + loc), + payload(payload) {} + SpirvBinaryOp::SpirvBinaryOp(spv::Op opcode, QualType resultType, SourceLocation loc, SpirvInstruction *op1, SpirvInstruction *op2, SourceRange range) @@ -563,7 +606,8 @@ bool SpirvConstant::isSpecConstant() const { return opcode == spv::Op::OpSpecConstant || opcode == spv::Op::OpSpecConstantTrue || opcode == spv::Op::OpSpecConstantFalse || - opcode == spv::Op::OpSpecConstantComposite; + opcode == spv::Op::OpSpecConstantComposite || + opcode == spv::Op::OpSpecConstantStringAMDX; } SpirvConstantBoolean::SpirvConstantBoolean(QualType type, bool val, @@ -618,6 +662,19 @@ SpirvConstantComposite::SpirvConstantComposite( type), constituents(constituentsVec.begin(), constituentsVec.end()) {} +SpirvConstantString::SpirvConstantString(llvm::StringRef stringLiteral, + bool isSpecConst) + : SpirvConstant(IK_ConstantString, + isSpecConst ? 
spv::Op::OpSpecConstantStringAMDX + : spv::Op::OpConstantStringAMDX, + QualType()), + str(stringLiteral) {} + +bool SpirvConstantString::operator==(const SpirvConstantString &that) const { + return opcode == that.opcode && resultType == that.resultType && + str == that.str; +} + SpirvConstantNull::SpirvConstantNull(QualType type) : SpirvConstant(IK_ConstantNull, spv::Op::OpConstantNull, type) {} diff --git a/tools/clang/lib/SPIRV/SpirvModule.cpp b/tools/clang/lib/SPIRV/SpirvModule.cpp index 9c6a826a5b..ed6aca7488 100644 --- a/tools/clang/lib/SPIRV/SpirvModule.cpp +++ b/tools/clang/lib/SPIRV/SpirvModule.cpp @@ -294,9 +294,10 @@ void SpirvModule::addEntryPoint(SpirvEntryPoint *ep) { entryPoints.push_back(ep); } -SpirvExecutionMode *SpirvModule::findExecutionMode(SpirvFunction *entryPoint, - spv::ExecutionMode em) { - for (SpirvExecutionMode *cem : executionModes) { +SpirvExecutionModeBase * +SpirvModule::findExecutionMode(SpirvFunction *entryPoint, + spv::ExecutionMode em) { + for (SpirvExecutionModeBase *cem : executionModes) { if (cem->getEntryPoint() != entryPoint) continue; if (cem->getExecutionMode() != em) @@ -306,7 +307,7 @@ SpirvExecutionMode *SpirvModule::findExecutionMode(SpirvFunction *entryPoint, return nullptr; } -void SpirvModule::addExecutionMode(SpirvExecutionMode *em) { +void SpirvModule::addExecutionMode(SpirvExecutionModeBase *em) { assert(em && "cannot add null execution mode"); executionModes.push_back(em); } diff --git a/tools/clang/lib/SPIRV/SpirvType.cpp b/tools/clang/lib/SPIRV/SpirvType.cpp index cabeba4cda..286e6224a4 100644 --- a/tools/clang/lib/SPIRV/SpirvType.cpp +++ b/tools/clang/lib/SPIRV/SpirvType.cpp @@ -167,6 +167,10 @@ bool RuntimeArrayType::operator==(const RuntimeArrayType &that) const { (!stride.hasValue() || stride.getValue() == that.stride.getValue()); } +bool NodePayloadArrayType::operator==(const NodePayloadArrayType &that) const { + return elementType == that.elementType && nodeDecl == that.nodeDecl; +} + bool SpvIntrinsicTypeOperand::operator==( const SpvIntrinsicTypeOperand &that) const { if (isTypeOperand != that.isTypeOperand) diff --git a/tools/clang/lib/Sema/SemaCast.cpp b/tools/clang/lib/Sema/SemaCast.cpp index f5a864e2b6..dcff6c2461 100644 --- a/tools/clang/lib/Sema/SemaCast.cpp +++ b/tools/clang/lib/Sema/SemaCast.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. 
-// //===----------------------------------------------------------------------===// // // This file implements semantic analysis for cast expressions, including diff --git a/tools/clang/lib/Sema/SemaChecking.cpp b/tools/clang/lib/Sema/SemaChecking.cpp index 9e64732336..e3932220f9 100644 --- a/tools/clang/lib/Sema/SemaChecking.cpp +++ b/tools/clang/lib/Sema/SemaChecking.cpp @@ -1426,7 +1426,7 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, CheckMemaccessArguments(TheCall, CMId, FnInfo); #endif // HLSL Change Ends - CheckHLSLFunctionCall(FDecl, TheCall, Proto); // HLSL Change + CheckHLSLFunctionCall(FDecl, TheCall); // HLSL Change return false; } diff --git a/tools/clang/lib/Sema/SemaCodeComplete.cpp b/tools/clang/lib/Sema/SemaCodeComplete.cpp index b1b4668ba3..84d0990346 100644 --- a/tools/clang/lib/Sema/SemaCodeComplete.cpp +++ b/tools/clang/lib/Sema/SemaCodeComplete.cpp @@ -4020,7 +4020,7 @@ void Sema::CodeCompleteCall(Scope *S, Expr *Fn, ArrayRef Args) { Expr *NakedFn = Fn->IgnoreParenCasts(); if (auto ULE = dyn_cast(NakedFn)) - AddOverloadedCallCandidates(ULE, Args, CandidateSet, + AddOverloadedCallCandidates(ULE, Args, CandidateSet, S, // HLSL Change /*PartialOverloading=*/true); else if (auto UME = dyn_cast(NakedFn)) { TemplateArgumentListInfo TemplateArgsBuffer, *TemplateArgs = nullptr; diff --git a/tools/clang/lib/Sema/SemaDXR.cpp b/tools/clang/lib/Sema/SemaDXR.cpp index 36ab55ea10..0ccb21fb2b 100644 --- a/tools/clang/lib/Sema/SemaDXR.cpp +++ b/tools/clang/lib/Sema/SemaDXR.cpp @@ -28,6 +28,7 @@ #include "dxc/DXIL/DxilConstants.h" #include "dxc/DXIL/DxilShaderModel.h" +#include "dxc/HlslIntrinsicOp.h" using namespace clang; using namespace sema; @@ -49,9 +50,9 @@ struct PayloadUse { const MemberExpr *Member = nullptr; }; -struct TraceRayCall { - TraceRayCall() = default; - TraceRayCall(const CallExpr *Call, const CFGBlock *Parent) +struct PayloadBuiltinCall { + PayloadBuiltinCall() = default; + PayloadBuiltinCall(const CallExpr *Call, const CFGBlock *Parent) : Call(Call), Parent(Parent) {} const CallExpr *Call = nullptr; const CFGBlock *Parent = nullptr; @@ -71,7 +72,7 @@ struct DxrShaderDiagnoseInfo { const FunctionDecl *funcDecl; const VarDecl *Payload; DXIL::PayloadAccessShaderStage Stage; - std::vector TraceCalls; + std::vector PayloadBuiltinCalls; std::map> WritesPerField; std::map> ReadsPerField; std::vector PayloadAsCallArg; @@ -121,24 +122,42 @@ GetPayloadQualifierForStage(FieldDecl *Field, return DXIL::PayloadAccessQualifier::NoAccess; } -// Returns the declaration of the payload used in a TraceRay call -const VarDecl *GetPayloadParameterForTraceCall(const CallExpr *Trace) { - const Decl *callee = Trace->getCalleeDecl(); - if (!callee) +static int GetPayloadParamIdxForIntrinsic(const FunctionDecl *FD) { + HLSLIntrinsicAttr *IntrinAttr = FD->getAttr(); + if (!IntrinAttr) + return -1; + switch ((IntrinsicOp)IntrinAttr->getOpcode()) { + default: + return -1; + case IntrinsicOp::IOP_TraceRay: + case IntrinsicOp::MOP_DxHitObject_TraceRay: + case IntrinsicOp::MOP_DxHitObject_Invoke: + return FD->getNumParams() - 1; + } +} + +static bool IsBuiltinWithPayload(const FunctionDecl *FD) { + return GetPayloadParamIdxForIntrinsic(FD) >= 0; +} + +// Returns the declaration of the payload used in a call to TraceRay, +// HitObject::TraceRay or HitObject::Invoke. 
+const VarDecl *GetPayloadParameterForBuiltinCall(const CallExpr *Call) { + const Decl *Callee = Call->getCalleeDecl(); + if (!Callee) return nullptr; - if (!isa(callee)) + if (!isa(Callee)) return nullptr; - const FunctionDecl *FD = cast(callee); + int PldParamIdx = GetPayloadParamIdxForIntrinsic(cast(Callee)); + if (PldParamIdx < 0) + return nullptr; - if (FD->isImplicit() && FD->getName() == "TraceRay") { - const Stmt *Param = IgnoreParensAndDecay(Trace->getArg(7)); - if (const DeclRefExpr *ParamRef = dyn_cast(Param)) { - if (const VarDecl *Decl = dyn_cast(ParamRef->getDecl())) - return Decl; - } - } + const Stmt *Param = IgnoreParensAndDecay(Call->getArg(PldParamIdx)); + if (const DeclRefExpr *ParamRef = dyn_cast(Param)) + if (const VarDecl *Decl = dyn_cast(ParamRef->getDecl())) + return Decl; return nullptr; } @@ -190,12 +209,9 @@ void CollectReadsWritesAndCallsForPayload(const Stmt *S, } } -// Collects all TraceRay calls. -void CollectTraceRayCalls(const Stmt *S, DxrShaderDiagnoseInfo &Info, - const CFGBlock *Block) { - // TraceRay has void as return type so it should never be something else - // than a plain CallExpr. - +// Collects all calls to TraceRay, HitObject::TraceRay and HitObject::Invoke. +void CollectBuiltinCallsWithPayload(const Stmt *S, DxrShaderDiagnoseInfo &Info, + const CFGBlock *Block) { if (const CallExpr *Call = dyn_cast(S)) { const Decl *Callee = Call->getCalleeDecl(); @@ -204,11 +220,8 @@ void CollectTraceRayCalls(const Stmt *S, DxrShaderDiagnoseInfo &Info, const FunctionDecl *CalledFunction = cast(Callee); - // Ignore trace calls here. - if (CalledFunction->isImplicit() && - CalledFunction->getName() == "TraceRay") { - Info.TraceCalls.push_back({Call, Block}); - } + if (IsBuiltinWithPayload(CalledFunction)) + Info.PayloadBuiltinCalls.push_back({Call, Block}); } } @@ -528,13 +541,14 @@ void TraverseCFG(const CFGBlock &Block, Action PerElementAction, } } -// Forward traverse the CFG and collect calls to TraceRay. -void ForwardTraverseCFGAndCollectTraceCalls( +// Forward traverse the CFG and collect calls to TraceRay, HitObject::TraceRay +// and HitObject::Invoke. +void ForwardTraverseCFGAndCollectBuiltinCallsWithPayload( const CFGBlock &Block, DxrShaderDiagnoseInfo &Info, std::set &Visited) { auto Action = [&Info](const CFGBlock &Block, const CFGElement &Element) { if (Optional S = Element.getAs()) { - CollectTraceRayCalls(S->getStmt(), Info, &Block); + CollectBuiltinCallsWithPayload(S->getStmt(), Info, &Block); } }; @@ -664,9 +678,9 @@ DiagnosePayloadAsFunctionArg( const FunctionDecl *CalledFunction = cast(Callee); // Ignore trace calls here. - if (CalledFunction->isImplicit() && - CalledFunction->getName() == "TraceRay") { - Info.TraceCalls.push_back(TraceRayCall{Call, Use.Parent}); + if (IsBuiltinWithPayload(CalledFunction)) { + Info.PayloadBuiltinCalls.push_back( + PayloadBuiltinCall{Call, Use.Parent}); continue; } @@ -789,10 +803,12 @@ void HandlePayloadInitializer(DxrShaderDiagnoseInfo &Info) { } } -// Emit diagnostics for a TraceRay call. -void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, - const TraceRayCall &Trace, DominatorTree &DT) { - // For each TraceRay call check if write(caller) fields are written. +// Emit diagnostics for this call to either TraceRay, HitObject::TraceRay or +// HitObject::Invoke. +void DiagnoseBuiltinCallWithPayload(Sema &S, const VarDecl *Payload, + const PayloadBuiltinCall &PldCall, + DominatorTree &DT) { + // For each call check if write(caller) fields are written. 
const DXIL::PayloadAccessShaderStage CallerStage = DXIL::PayloadAccessShaderStage::Caller; @@ -810,12 +826,17 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, return; } - if (ContainsLongVector(Payload->getType())) { - const unsigned PayloadParametersIdx = 10; - S.Diag(Payload->getLocation(), diag::err_hlsl_unsupported_long_vector) - << PayloadParametersIdx; + // Verify that the payload type is legal + if (!hlsl::IsHLSLCopyableAnnotatableRecord(Payload->getType())) + S.Diag(Payload->getLocation(), diag::err_payload_attrs_must_be_udt) + << /*payload|attributes|callable*/ 0 << /*parameter %2|type*/ 0 + << Payload; + + // This will produce more details, but also catch disallowed long vectors + const TypeDiagContext DiagContext = TypeDiagContext::PayloadParameters; + if (DiagnoseTypeElements(S, Payload->getLocation(), Payload->getType(), + DiagContext, DiagContext)) return; - } CollectNonAccessableFields(PayloadType, CallerStage, {}, {}, NonWriteableFields, NonReadableFields); @@ -832,12 +853,12 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, std::set Visited; - const CFGBlock *Parent = Trace.Parent; + const CFGBlock *Parent = PldCall.Parent; Visited.insert(Parent); - // Collect payload accesses in the same block until we reach the TraceRay call + // Collect payload accesses in the same block until we reach the call for (auto Element : *Parent) { if (Optional S = Element.getAs()) { - if (S->getStmt() == Trace.Call) + if (S->getStmt() == PldCall.Call) break; CollectReadsWritesAndCallsForPayload(S->getStmt(), TraceInfo, Parent); } @@ -850,10 +871,12 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, BackwardTraverseCFGAndCollectReadsWrites(*Pred, TraceInfo, Visited); } + int PldArgIdx = PldCall.Call->getNumArgs() - 1; + // Warn if a writeable field has not been written. for (const FieldDecl *Field : WriteableFields) { if (!TraceInfo.WritesPerField.count(Field)) { - S.Diag(Trace.Call->getArg(7)->getExprLoc(), + S.Diag(PldCall.Call->getArg(PldArgIdx)->getExprLoc(), diag::warn_hlsl_payload_access_no_write_for_trace_payload) << Field->getName(); } @@ -862,7 +885,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, for (const FieldDecl *Field : NonWriteableFields) { if (TraceInfo.WritesPerField.count(Field)) { S.Diag( - Trace.Call->getArg(7)->getExprLoc(), + PldCall.Call->getArg(PldArgIdx)->getExprLoc(), diag::warn_hlsl_payload_access_write_but_no_write_for_trace_payload) << Field->getName(); } @@ -878,7 +901,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, bool CallFound = false; for (auto Element : *Parent) { // TODO: reverse iterate? if (Optional S = Element.getAs()) { - if (S->getStmt() == Trace.Call) { + if (S->getStmt() == PldCall.Call) { CallFound = true; continue; } @@ -895,7 +918,7 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, for (const FieldDecl *Field : ReadableFields) { if (!TraceInfo.ReadsPerField.count(Field)) { - S.Diag(Trace.Call->getArg(7)->getExprLoc(), + S.Diag(PldCall.Call->getArg(PldArgIdx)->getExprLoc(), diag::warn_hlsl_payload_access_read_but_no_read_after_trace) << Field->getName(); } @@ -928,27 +951,29 @@ void DiagnoseTraceCall(Sema &S, const VarDecl *Payload, } } -// Emit diagnostics for all TraceRay calls. -void DiagnoseTraceCalls(Sema &S, CFG &ShaderCFG, DominatorTree &DT, - DxrShaderDiagnoseInfo &Info) { - // Collect TraceRay calls in the shader. +// Emit diagnostics for all calls to TraceRay, HitObject::TraceRay or +// HitObject::Invoke. 
+void DiagnoseBuiltinCallsWithPayload(Sema &S, CFG &ShaderCFG, DominatorTree &DT, + DxrShaderDiagnoseInfo &Info) { + // Collect calls with payload in the shader. std::set Visited; - ForwardTraverseCFGAndCollectTraceCalls(ShaderCFG.getEntry(), Info, Visited); + ForwardTraverseCFGAndCollectBuiltinCallsWithPayload(ShaderCFG.getEntry(), + Info, Visited); std::set Diagnosed; - for (const TraceRayCall &TraceCall : Info.TraceCalls) { - if (Diagnosed.count(TraceCall.Call)) + for (const PayloadBuiltinCall &PldCall : Info.PayloadBuiltinCalls) { + if (Diagnosed.count(PldCall.Call)) continue; - Diagnosed.insert(TraceCall.Call); + Diagnosed.insert(PldCall.Call); - const VarDecl *Payload = GetPayloadParameterForTraceCall(TraceCall.Call); - DiagnoseTraceCall(S, Payload, TraceCall, DT); + const VarDecl *Payload = GetPayloadParameterForBuiltinCall(PldCall.Call); + DiagnoseBuiltinCallWithPayload(S, Payload, PldCall, DT); } } // Emit diagnostics for all access to the payload of a shader, -// and the input to TraceRay calls. +// and the input to TraceRay, HitObject::TraceRay or HitObject::Invoke calls. std::vector DiagnosePayloadAccess(Sema &S, DxrShaderDiagnoseInfo &Info, const std::set &FieldsToIgnoreRead, @@ -1012,7 +1037,7 @@ DiagnosePayloadAccess(Sema &S, DxrShaderDiagnoseInfo &Info, DiagnosePayloadReads(S, TheCFG, DT, Info, NonReadableFields); } - DiagnoseTraceCalls(S, TheCFG, DT, Info); + DiagnoseBuiltinCallsWithPayload(S, TheCFG, DT, Info); return WrittenFields; } @@ -1165,9 +1190,13 @@ void DiagnoseCallableEntry(Sema &S, FunctionDecl *FD, << /*payload|callable*/ 1 << Param; QualType Ty = Param->getType().getNonReferenceType(); - if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) + // Don't diagnose incomplete type here. Function parameters are + // checked in Sema::CheckParmsForFunctionDef. + if (!S.RequireCompleteType(Param->getLocation(), Ty, 0) && + !(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt) - << /*payload|attributes|callable*/ 2 << Param; + << /*payload|attributes|callable*/ 2 << /*parameter %2|type*/ 0 + << Param; } return; } @@ -1206,9 +1235,15 @@ void DiagnoseMissOrAnyHitEntry(Sema &S, FunctionDecl *FD, QualType Ty = Param->getType().getNonReferenceType(); + // Don't diagnose here, just continue if this fails. Function parameters are + // checked in Sema::CheckParmsForFunctionDef. + if (S.RequireCompleteType(Param->getLocation(), Ty, 0)) + continue; + if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) { S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt) - << /*payload|attributes|callable*/ Idx << Param; + << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0 + << Param; } } return; @@ -1259,9 +1294,15 @@ void DiagnoseClosestHitEntry(Sema &S, FunctionDecl *FD, QualType Ty = Param->getType().getNonReferenceType(); + // Don't diagnose here, just continue if this fails. Function parameters are + // checked in Sema::CheckParmsForFunctionDef. 
+ if (S.RequireCompleteType(Param->getLocation(), Ty, 0)) + continue; + if (!(hlsl::IsHLSLCopyableAnnotatableRecord(Ty))) { S.Diag(Param->getLocation(), diag::err_payload_attrs_must_be_udt) - << /*payload|attributes|callable*/ Idx << Param; + << /*payload|attributes|callable*/ Idx << /*parameter %2|type*/ 0 + << Param; } } return; diff --git a/tools/clang/lib/Sema/SemaDecl.cpp b/tools/clang/lib/Sema/SemaDecl.cpp index e09bf4623c..a772054960 100644 --- a/tools/clang/lib/Sema/SemaDecl.cpp +++ b/tools/clang/lib/Sema/SemaDecl.cpp @@ -5331,7 +5331,7 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { Qualifiers::ObjCLifetime lifetime = type.getObjCLifetime(); if (lifetime == Qualifiers::OCL_Autoreleasing) { // Various kinds of declaration aren't allowed to be __autoreleasing. - unsigned kind = -1U; + unsigned kind = ~0U; if (VarDecl *var = dyn_cast(decl)) { if (var->hasAttr()) kind = 0; // __block @@ -5343,7 +5343,7 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { kind = 2; // field } - if (kind != -1U) { + if (kind != ~0U) { Diag(decl->getLocation(), diag::err_arc_autoreleasing_var) << kind; } diff --git a/tools/clang/lib/Sema/SemaExpr.cpp b/tools/clang/lib/Sema/SemaExpr.cpp index 507b6a7508..cccf711126 100644 --- a/tools/clang/lib/Sema/SemaExpr.cpp +++ b/tools/clang/lib/Sema/SemaExpr.cpp @@ -1466,7 +1466,7 @@ Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc, ContainsUnexpandedParameterPack); SmallVector CompatIndices; - unsigned DefaultIndex = -1U; + unsigned DefaultIndex = std::numeric_limits::max(); for (unsigned i = 0; i < NumAssocs; ++i) { if (!Types[i]) DefaultIndex = i; @@ -1498,7 +1498,8 @@ Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc, // C11 6.5.1.1p2 "If a generic selection has no default generic association, // its controlling expression shall have type compatible with exactly one of // the types named in its generic association list." - if (DefaultIndex == -1U && CompatIndices.size() == 0) { + if (DefaultIndex == std::numeric_limits::max() && + CompatIndices.size() == 0) { // We strip parens here because the controlling expression is typically // parenthesized in macro definitions. ControllingExpr = ControllingExpr->IgnoreParens(); @@ -3504,12 +3505,14 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) { Ty = Context.LitIntTy; if (Literal.GetIntegerValue(ResultVal)) { // If this value didn't fit into 64-bit literal int, report error. - Diag(Tok.getLocation(), diag::err_integer_literal_too_large); + Diag(Tok.getLocation(), diag::err_integer_literal_too_large) + << /* Unsigned */ 1; } } else { if (Literal.GetIntegerValue(ResultVal)) { - Diag(Tok.getLocation(), diag::err_integer_literal_too_large); + Diag(Tok.getLocation(), diag::err_integer_literal_too_large) + << /* Unsigned */ 1; } if (Literal.isLongLong) { if (Literal.isUnsigned) @@ -3798,13 +3801,21 @@ static void warnOnSizeofOnArrayDecay(Sema &S, SourceLocation Loc, QualType T, } // HLSL Change Begins -bool Sema::CheckHLSLUnaryExprOrTypeTraitOperand(QualType ExprType, SourceLocation Loc, +bool Sema::CheckHLSLUnaryExprOrTypeTraitOperand(QualType ExprType, + SourceLocation Loc, UnaryExprOrTypeTrait ExprKind) { assert(ExprKind == UnaryExprOrTypeTrait::UETT_SizeOf); - // "sizeof 42" is ill-defined because HLSL has literal int type which can decay to an int of any size. 
- const BuiltinType* BuiltinTy = ExprType->getAs(); - if (BuiltinTy != nullptr && (BuiltinTy->getKind() == BuiltinType::LitInt || BuiltinTy->getKind() == BuiltinType::LitFloat)) { + if (RequireCompleteType(Loc, ExprType, + diag::err_sizeof_alignof_incomplete_type, ExprKind, + ExprType)) + return true; + + // "sizeof 42" is ill-defined because HLSL has literal int type which can + // decay to an int of any size. + const BuiltinType *BuiltinTy = ExprType->getAs(); + if (BuiltinTy != nullptr && (BuiltinTy->getKind() == BuiltinType::LitInt || + BuiltinTy->getKind() == BuiltinType::LitFloat)) { Diag(Loc, diag::err_hlsl_sizeof_literal) << ExprType; return true; } @@ -5338,8 +5349,6 @@ Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, if (FDecl) { if (CheckFunctionCall(FDecl, TheCall, Proto)) return ExprError(); - if (CheckHLSLFunctionCall(FDecl, TheCall)) - return ExprError(); if (BuiltinID) return CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall); } else if (NDecl) { diff --git a/tools/clang/lib/Sema/SemaExprCXX.cpp b/tools/clang/lib/Sema/SemaExprCXX.cpp index 5113c56205..1e70b95476 100644 --- a/tools/clang/lib/Sema/SemaExprCXX.cpp +++ b/tools/clang/lib/Sema/SemaExprCXX.cpp @@ -5,9 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// //===----------------------------------------------------------------------===// /// /// \file diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp index 418425a468..656dfb401f 100644 --- a/tools/clang/lib/Sema/SemaHLSL.cpp +++ b/tools/clang/lib/Sema/SemaHLSL.cpp @@ -6,15 +6,13 @@ // This file is distributed under the University of Illinois Open Source // // License. See LICENSE.TXT for details. // // // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. // -// All rights reserved. // -// // // This file implements the semantic support for HLSL. 
// // // /////////////////////////////////////////////////////////////////////////////// #include "clang/Sema/SemaHLSL.h" #include "VkConstantsTables.h" +#include "dxc/DXIL/DxilConstants.h" #include "dxc/DXIL/DxilFunctionProps.h" #include "dxc/DXIL/DxilShaderModel.h" #include "dxc/DXIL/DxilUtil.h" @@ -46,6 +44,7 @@ #include "clang/Sema/TemplateDeduction.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -1138,6 +1137,14 @@ static const ArBasicKind g_RayDescCT[] = {AR_OBJECT_RAY_DESC, AR_BASIC_UNKNOWN}; static const ArBasicKind g_RayQueryCT[] = {AR_OBJECT_RAY_QUERY, AR_BASIC_UNKNOWN}; +static const ArBasicKind g_LinAlgCT[] = { + AR_BASIC_FLOAT32, AR_BASIC_FLOAT32_PARTIAL_PRECISION, + AR_BASIC_FLOAT16, AR_BASIC_INT32, + AR_BASIC_INT16, AR_BASIC_UINT32, + AR_BASIC_UINT16, AR_BASIC_INT8_4PACKED, + AR_BASIC_UINT8_4PACKED, AR_BASIC_NOCAST, + AR_BASIC_UNKNOWN}; + static const ArBasicKind g_AccelerationStructCT[] = { AR_OBJECT_ACCELERATION_STRUCT, AR_BASIC_UNKNOWN}; @@ -1301,6 +1308,7 @@ const ArBasicKind *g_LegalIntrinsicCompTypes[] = { g_ThreadNodeOutputRecordsCT, // LICOMPTYPE_THREAD_NODE_OUTPUT_RECORDS g_DxHitObjectCT, // LICOMPTYPE_HIT_OBJECT g_RayQueryCT, // LICOMPTYPE_RAY_QUERY + g_LinAlgCT, // LICOMPTYPE_LINALG #ifdef ENABLE_SPIRV_CODEGEN g_VKBufferPointerCT, // LICOMPTYPE_VK_BUFFER_POINTER #endif @@ -4144,6 +4152,7 @@ class HLSLExternalSource : public ExternalSemaSource { SourceLocation(), &context.Idents.get("dx"), /*PrevDecl*/ nullptr); m_dxNSDecl->setImplicit(); + m_dxNSDecl->setHasExternalLexicalStorage(true); context.getTranslationUnitDecl()->addDecl(m_dxNSDecl); #ifdef ENABLE_SPIRV_CODEGEN @@ -5161,7 +5170,7 @@ class HLSLExternalSource : public ExternalSemaSource { bool AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE, ArrayRef Args, - OverloadCandidateSet &CandidateSet, + OverloadCandidateSet &CandidateSet, Scope *S, bool PartialOverloading) override { DXASSERT_NOMSG(ULE != nullptr); @@ -5186,6 +5195,8 @@ class HLSLExternalSource : public ExternalSemaSource { // Exceptions: // - Vulkan-specific intrinsics live in the 'vk::' namespace. // - DirectX-specific intrinsics live in the 'dx::' namespace. + // - Global namespaces could just mean we have a `using` declaration... so + // it can be anywhere! 
if (isQualified && !isGlobalNamespace && !isVkNamespace && !isDxNamespace) return false; @@ -5196,81 +5207,106 @@ class HLSLExternalSource : public ExternalSemaSource { } StringRef nameIdentifier = idInfo->getName(); - const HLSL_INTRINSIC *table = g_Intrinsics; - auto tableCount = _countof(g_Intrinsics); - if (isDxNamespace) { - table = g_DxIntrinsics; - tableCount = _countof(g_DxIntrinsics); + using IntrinsicArray = llvm::ArrayRef; + struct IntrinsicTableEntry { + IntrinsicArray Table; + NamespaceDecl *NS; + }; + + llvm::SmallVector SearchTables; + + bool SearchDX = isDxNamespace; + bool SearchVK = isVkNamespace; + if (isGlobalNamespace || !isQualified) + SearchTables.push_back( + IntrinsicTableEntry{IntrinsicArray(g_Intrinsics), m_hlslNSDecl}); + + if (S && !isQualified) { + SmallVector NSContexts; + m_sema->CollectNamespaceContexts(S, NSContexts); + for (const auto &UD : NSContexts) { + if (static_cast(m_dxNSDecl) == UD) + SearchDX = true; + else if (static_cast(m_vkNSDecl) == UD) + SearchVK = true; + } } + + if (SearchDX) + SearchTables.push_back( + IntrinsicTableEntry{IntrinsicArray(g_DxIntrinsics), m_dxNSDecl}); #ifdef ENABLE_SPIRV_CODEGEN - if (isVkNamespace) { - table = g_VkIntrinsics; - tableCount = _countof(g_VkIntrinsics); - } -#endif // ENABLE_SPIRV_CODEGEN + if (SearchVK) + SearchTables.push_back( + IntrinsicTableEntry{IntrinsicArray(g_VkIntrinsics), m_vkNSDecl}); +#endif - IntrinsicDefIter cursor = FindIntrinsicByNameAndArgCount( - table, tableCount, StringRef(), nameIdentifier, Args.size()); - IntrinsicDefIter end = IntrinsicDefIter::CreateEnd( - table, tableCount, IntrinsicTableDefIter::CreateEnd(m_intrinsicTables)); - - for (; cursor != end; ++cursor) { - // If this is the intrinsic we're interested in, build up a representation - // of the types we need. - const HLSL_INTRINSIC *pIntrinsic = *cursor; - LPCSTR tableName = cursor.GetTableName(); - LPCSTR lowering = cursor.GetLoweringStrategy(); - DXASSERT(pIntrinsic->uNumArgs <= g_MaxIntrinsicParamCount + 1, - "otherwise g_MaxIntrinsicParamCount needs to be updated for " - "wider signatures"); - - std::vector functionArgTypes; - size_t badArgIdx; - bool argsMatch = - MatchArguments(cursor, QualType(), QualType(), QualType(), Args, - &functionArgTypes, badArgIdx); - if (!functionArgTypes.size()) - return false; + assert(!SearchTables.empty() && "Must have at least one search table!"); + + for (const auto &T : SearchTables) { + + IntrinsicDefIter cursor = FindIntrinsicByNameAndArgCount( + T.Table.data(), T.Table.size(), StringRef(), nameIdentifier, + Args.size()); + IntrinsicDefIter end = IntrinsicDefIter::CreateEnd( + T.Table.data(), T.Table.size(), + IntrinsicTableDefIter::CreateEnd(m_intrinsicTables)); + + for (; cursor != end; ++cursor) { + // If this is the intrinsic we're interested in, build up a + // representation of the types we need. + const HLSL_INTRINSIC *pIntrinsic = *cursor; + LPCSTR tableName = cursor.GetTableName(); + LPCSTR lowering = cursor.GetLoweringStrategy(); + DXASSERT(pIntrinsic->uNumArgs <= g_MaxIntrinsicParamCount + 1, + "otherwise g_MaxIntrinsicParamCount needs to be updated for " + "wider signatures"); + + std::vector functionArgTypes; + size_t badArgIdx; + bool argsMatch = + MatchArguments(cursor, QualType(), QualType(), QualType(), Args, + &functionArgTypes, badArgIdx); + if (!functionArgTypes.size()) + return false; - // Get or create the overload we're interested in. 
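A simplified model of the table-selection logic introduced above: each namespace owns its own intrinsic table, and an unqualified call may have to search several tables when a using-directive makes dx:: or vk:: visible in the current scope. The names and data layout here are illustrative only.

#include <cassert>
#include <string>
#include <vector>

struct Intrinsic { std::string Name; };
using Table = std::vector<Intrinsic>;

// Search the candidate tables in order and return the first match, mirroring
// the SearchTables loop above (which additionally records the owning namespace).
const Intrinsic *FindIntrinsic(const std::vector<const Table *> &SearchTables,
                               const std::string &Name) {
  for (const Table *T : SearchTables)
    for (const Intrinsic &I : *T)
      if (I.Name == Name)
        return &I;
  return nullptr;
}

int main() {
  Table Global = {{"abs"}, {"dot"}};
  Table Dx = {{"MaybeReorderThread"}};
  // Unqualified lookup with 'using namespace dx;' in scope searches both tables.
  std::vector<const Table *> SearchTables = {&Global, &Dx};
  assert(FindIntrinsic(SearchTables, "MaybeReorderThread") != nullptr);
  // Without the using-directive only the global table is searched.
  assert(FindIntrinsic({&Global}, "MaybeReorderThread") == nullptr);
}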
- FunctionDecl *intrinsicFuncDecl = nullptr; - std::pair insertResult = - m_usedIntrinsics.insert(UsedIntrinsic(pIntrinsic, functionArgTypes)); - bool insertedNewValue = insertResult.second; - if (insertedNewValue) { - NamespaceDecl *nsDecl = m_hlslNSDecl; - if (isVkNamespace) - nsDecl = m_vkNSDecl; - else if (isDxNamespace) - nsDecl = m_dxNSDecl; - DXASSERT(tableName, - "otherwise IDxcIntrinsicTable::GetTableName() failed"); - intrinsicFuncDecl = - AddHLSLIntrinsicFunction(*m_context, nsDecl, tableName, lowering, - pIntrinsic, &functionArgTypes); - insertResult.first->setFunctionDecl(intrinsicFuncDecl); - } else { - intrinsicFuncDecl = (*insertResult.first).getFunctionDecl(); - } + // Get or create the overload we're interested in. + FunctionDecl *intrinsicFuncDecl = nullptr; + std::pair insertResult = + m_usedIntrinsics.insert( + UsedIntrinsic(pIntrinsic, functionArgTypes)); + bool insertedNewValue = insertResult.second; + if (insertedNewValue) { + DXASSERT(tableName, + "otherwise IDxcIntrinsicTable::GetTableName() failed"); + intrinsicFuncDecl = + AddHLSLIntrinsicFunction(*m_context, T.NS, tableName, lowering, + pIntrinsic, &functionArgTypes); + insertResult.first->setFunctionDecl(intrinsicFuncDecl); + } else { + intrinsicFuncDecl = (*insertResult.first).getFunctionDecl(); + } - OverloadCandidate &candidate = CandidateSet.addCandidate(Args.size()); - candidate.Function = intrinsicFuncDecl; - candidate.FoundDecl.setDecl(intrinsicFuncDecl); - candidate.Viable = argsMatch; - CandidateSet.isNewCandidate(intrinsicFuncDecl); // used to insert into set - if (argsMatch) - return true; - if (badArgIdx) { - candidate.FailureKind = ovl_fail_bad_conversion; - QualType ParamType = - intrinsicFuncDecl->getParamDecl(badArgIdx - 1)->getType(); - candidate.Conversions[badArgIdx - 1].setBad( - BadConversionSequence::no_conversion, Args[badArgIdx - 1], - ParamType); - } else { - // A less informative error. Needed when the failure relates to the - // return type - candidate.FailureKind = ovl_fail_bad_final_conversion; + OverloadCandidate &candidate = CandidateSet.addCandidate(Args.size()); + candidate.Function = intrinsicFuncDecl; + candidate.FoundDecl.setDecl(intrinsicFuncDecl); + candidate.Viable = argsMatch; + CandidateSet.isNewCandidate( + intrinsicFuncDecl); // used to insert into set + if (argsMatch) + return true; + if (badArgIdx) { + candidate.FailureKind = ovl_fail_bad_conversion; + QualType ParamType = + intrinsicFuncDecl->getParamDecl(badArgIdx - 1)->getType(); + candidate.Conversions[badArgIdx - 1].setBad( + BadConversionSequence::no_conversion, Args[badArgIdx - 1], + ParamType); + } else { + // A less informative error. 
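For reference, a minimal sketch of the get-or-create pattern used above for intrinsic overloads: the insert reports whether the entry is new, and only a new entry triggers creation of the declaration; later lookups reuse the cached one. The key type and FunctionDecl struct below are stand-ins, not the UsedIntrinsic machinery itself.

#include <cassert>
#include <map>
#include <string>

struct FunctionDecl { std::string Name; };

std::map<std::string, FunctionDecl *> UsedIntrinsics;

FunctionDecl *GetOrCreateOverload(const std::string &Key) {
  auto InsertResult = UsedIntrinsics.insert({Key, nullptr});
  if (InsertResult.second)  // newly inserted: build the declaration exactly once
    InsertResult.first->second = new FunctionDecl{Key};
  return InsertResult.first->second;
}

int main() {
  FunctionDecl *First = GetOrCreateOverload("dot(float3,float3)");
  FunctionDecl *Second = GetOrCreateOverload("dot(float3,float3)");
  assert(First == Second);  // the same overload is reused, not rebuilt
  delete First;
}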
Needed when the failure relates to the + // return type + candidate.FailureKind = ovl_fail_bad_final_conversion; + } } } @@ -5394,7 +5430,17 @@ class HLSLExternalSource : public ExternalSemaSource { objectKind = ClassifyRecordType(recordType); switch (objectKind) { case AR_TOBJ_OBJECT: - m_sema->Diag(argLoc, diag::err_hlsl_objectintemplateargument) << type; +#ifdef ENABLE_SPIRV_CODEGEN + if (const auto *namespaceDecl = dyn_cast( + recordType->getDecl()->getDeclContext()); + namespaceDecl && namespaceDecl->getName().equals("vk") && + (recordType->getDecl()->getName().equals("SpirvType") || + recordType->getDecl()->getName().equals("SpirvOpaqueType"))) { + return true; + } +#endif + m_sema->Diag(argLoc, diag::err_hlsl_unsupported_object_context) + << type << static_cast(TypeDiagContext::TypeParameter); return false; case AR_TOBJ_COMPOUND: { const RecordDecl *recordDecl = recordType->getDecl(); @@ -5533,14 +5579,27 @@ class HLSLExternalSource : public ExternalSemaSource { m_sema->RequireCompleteType(argSrcLoc, argType, diag::err_typecheck_decl_incomplete_type); - if (ContainsLongVector(argType)) { - const unsigned ConstantBuffersOrTextureBuffersIdx = 0; - m_sema->Diag(argSrcLoc, diag::err_hlsl_unsupported_long_vector) - << ConstantBuffersOrTextureBuffersIdx; + TypeDiagContext DiagContext = + TypeDiagContext::ConstantBuffersOrTextureBuffers; + if (DiagnoseTypeElements(*m_sema, argSrcLoc, argType, DiagContext, + DiagContext)) return true; - } } return false; + } else if (ResAttr && DXIL::IsStructuredBuffer(ResAttr->getResKind())) { + if (TemplateArgList.size() == 1) { + const TemplateArgumentLoc &ArgLoc = TemplateArgList[0]; + const TemplateArgument &Arg = ArgLoc.getArgument(); + if (Arg.getKind() == TemplateArgument::ArgKind::Type) { + QualType ArgType = Arg.getAsType(); + SourceLocation ArgSrcLoc = ArgLoc.getLocation(); + if (DiagnoseTypeElements( + *m_sema, ArgSrcLoc, ArgType, + TypeDiagContext::StructuredBuffers /*ObjDiagContext*/, + TypeDiagContext::Valid /*LongVecDiagContext*/)) + return true; + } + } } else if (Template->getTemplatedDecl()->hasAttr()) { @@ -5641,13 +5700,10 @@ class HLSLExternalSource : public ExternalSemaSource { CXXRecordDecl *Decl = arg.getAsType()->getAsCXXRecordDecl(); if (Decl && !Decl->isCompleteDefinition()) return true; - if (ContainsLongVector(arg.getAsType())) { - const unsigned TessellationPatchesIDx = 1; - m_sema->Diag(argLoc.getLocation(), - diag::err_hlsl_unsupported_long_vector) - << TessellationPatchesIDx; + const TypeDiagContext DiagContext = TypeDiagContext::TessellationPatches; + if (DiagnoseTypeElements(*m_sema, argLoc.getLocation(), arg.getAsType(), + DiagContext, DiagContext)) return true; - } } else if (Template->getTemplatedDecl()->hasAttr()) { DXASSERT(TemplateArgList.size() > 0, "Geometry streams should have at least one template args"); @@ -5660,13 +5716,10 @@ class HLSLExternalSource : public ExternalSemaSource { CXXRecordDecl *Decl = arg.getAsType()->getAsCXXRecordDecl(); if (Decl && !Decl->isCompleteDefinition()) return true; - if (ContainsLongVector(arg.getAsType())) { - const unsigned GeometryStreamsIdx = 2; - m_sema->Diag(argLoc.getLocation(), - diag::err_hlsl_unsupported_long_vector) - << GeometryStreamsIdx; + const TypeDiagContext DiagContext = TypeDiagContext::GeometryStreams; + if (DiagnoseTypeElements(*m_sema, argLoc.getLocation(), arg.getAsType(), + DiagContext, DiagContext)) return true; - } } bool isMatrix = Template->getCanonicalDecl() == @@ -5945,6 +5998,8 @@ class HLSLExternalSource : public ExternalSemaSource { "otherwise 
caller didn't initialize - there should be at least a " "void return type"); + const bool IsStatic = IsStaticMember(intrinsic); + // Create the template arguments. SmallVector templateArgs; for (size_t i = 0; i < parameterTypeCount; i++) { @@ -6010,15 +6065,19 @@ class HLSLExternalSource : public ExternalSemaSource { SmallVector Params; for (unsigned int i = 1; i < parameterTypeCount; i++) { + // The first parameter in the HLSL intrinsic record is just the intrinsic + // name and aliases with the 'this' pointer for non-static members. Skip + // this first parameter for static functions. + unsigned ParamIdx = IsStatic ? i : i - 1; IdentifierInfo *id = - &m_context->Idents.get(StringRef(intrinsic->pArgs[i - 1].pName)); + &m_context->Idents.get(StringRef(intrinsic->pArgs[ParamIdx].pName)); ParmVarDecl *paramDecl = ParmVarDecl::Create( *m_context, nullptr, NoLoc, NoLoc, id, parameterTypes[i], nullptr, StorageClass::SC_None, nullptr, paramMods[i - 1]); Params.push_back(paramDecl); } - StorageClass SC = IsStaticMember(intrinsic) ? SC_Static : SC_Extern; + StorageClass SC = IsStatic ? SC_Static : SC_Extern; QualType T = TInfo->getType(); DeclarationNameInfo NameInfo(FunctionTemplate->getDeclName(), NoLoc); CXXMethodDecl *method = CXXMethodDecl::Create( @@ -6731,8 +6790,8 @@ bool HLSLExternalSource::MatchArguments( (iArg != retArgIdx && retTypeIdx == pIntrinsicArg->uComponentTypeId); // For literal arg which don't affect return type, find concrete type. // For literal arg affect return type, - // TryEvalIntrinsic in CGHLSLMS.cpp will take care of cases - // where all argumentss are literal. + // TryEvalIntrinsic in CGHLSLMSFinishCodeGen.cpp will take care of + // cases where all arguments are literal. // CombineBasicTypes will cover the rest cases. if (!affectRetType) { TypeInfoEltKind = @@ -10770,6 +10829,26 @@ HLSLExternalSource::ApplyTypeSpecSignToParsedType(clang::QualType &type, } } +bool CheckIntersectionAttributeArg(Sema &S, Expr *E) { + SourceLocation Loc = E->getExprLoc(); + QualType Ty = E->getType(); + + // Identify problematic fields first (high diagnostic accuracy, may miss some + // invalid cases) + const TypeDiagContext DiagContext = TypeDiagContext::Attributes; + if (DiagnoseTypeElements(S, Loc, Ty, DiagContext, DiagContext)) + return true; + + // Must be a UDT (low diagnostic accuracy, catches remaining invalid cases) + if (Ty.isNull() || !hlsl::IsHLSLCopyableAnnotatableRecord(Ty)) { + S.Diag(Loc, diag::err_payload_attrs_must_be_udt) + << /*payload|attributes|callable*/ 1 << /*parameter %2|type*/ 1; + return true; + } + + return false; +} + Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL( FunctionTemplateDecl *FunctionTemplate, @@ -10892,28 +10971,38 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL( IsBABLoad = intrinsicOp == (UINT)IntrinsicOp::MOP_Load; IsBABStore = intrinsicOp == (UINT)IntrinsicOp::MOP_Store; } - if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() > 0) { - bool isLegalTemplate = false; + if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() >= 1) { SourceLocation Loc = ExplicitTemplateArgs->getLAngleLoc(); - auto TemplateDiag = diag::err_hlsl_intrinsic_template_arg_unsupported; - if (ExplicitTemplateArgs->size() >= 1 && (IsBABLoad || IsBABStore)) { - TemplateDiag = diag::err_hlsl_intrinsic_template_arg_requires_2018; - Loc = (*ExplicitTemplateArgs)[0].getLocation(); - if (Is2018) { - TemplateDiag = diag::err_hlsl_intrinsic_template_arg_numeric; - if (ExplicitTemplateArgs->size() == 1 && - 
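A small sketch of the record-index adjustment made above, assuming (as the new comment states) that slot 0 of an intrinsic's argument record holds the intrinsic name, which doubles as the implicit 'this' entry for non-static members and is skipped for static ones. The function name is illustrative.

#include <cassert>

// Map the 1-based loop index over explicit parameters to the slot in the
// intrinsic argument record that supplies the parameter name.
unsigned RecordIndexForParam(unsigned LoopIdx /*starts at 1*/, bool IsStatic) {
  return IsStatic ? LoopIdx : LoopIdx - 1;
}

int main() {
  // The first explicit parameter of a static method reads record slot 1;
  // a non-static method reads slot 0, which aliases the name/'this' entry.
  assert(RecordIndexForParam(1, /*IsStatic=*/true) == 1);
  assert(RecordIndexForParam(1, /*IsStatic=*/false) == 0);
}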
!functionTemplateTypeArg.isNull() && - hlsl::IsHLSLNumericOrAggregateOfNumericType( - functionTemplateTypeArg)) { - isLegalTemplate = true; - } - } + if (!IsBABLoad && !IsBABStore) { + getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_unsupported) + << intrinsicName; + return Sema::TemplateDeductionResult::TDK_Invalid; } - - if (!isLegalTemplate) { - getSema()->Diag(Loc, TemplateDiag) << intrinsicName; + Loc = (*ExplicitTemplateArgs)[0].getLocation(); + if (!Is2018) { + getSema()->Diag(Loc, + diag::err_hlsl_intrinsic_template_arg_requires_2018) + << intrinsicName; return Sema::TemplateDeductionResult::TDK_Invalid; } + + if (IsBABLoad || IsBABStore) { + const bool IsNull = functionTemplateTypeArg.isNull(); + // Incomplete type is diagnosed elsewhere, so just fail if incomplete. + if (!IsNull && + getSema()->RequireCompleteType(Loc, functionTemplateTypeArg, 0)) + return Sema::TemplateDeductionResult::TDK_Invalid; + if (IsNull || !hlsl::IsHLSLNumericOrAggregateOfNumericType( + functionTemplateTypeArg)) { + getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_numeric) + << intrinsicName; + DiagnoseTypeElements( + *getSema(), Loc, functionTemplateTypeArg, + TypeDiagContext::TypeParameter /*ObjDiagContext*/, + TypeDiagContext::Valid /*LongVecDiagContext*/); + return Sema::TemplateDeductionResult::TDK_Invalid; + } + } } else if (IsBABStore) { // Prior to HLSL 2018, Store operation only stored scalar uint. if (!Is2018) { @@ -11630,6 +11719,537 @@ static bool CheckBarrierCall(Sema &S, FunctionDecl *FD, CallExpr *CE, return false; } +// MatVec Ops +static const unsigned kMatVecMulOutputVectorIdx = 0; +static const unsigned kMatVecMulOutputIsUnsignedIdx = 1; +static const unsigned kMatVecMulInputVectorIdx = 2; +static const unsigned kMatVecMulIsInputUnsignedIdx = 3; +static const unsigned kMatVecMulInputInterpretationIdx = 4; +// static const unsigned kMatVecMulMatrixBufferIdx = 5; +// static const unsigned kMatVecMulMatrixOffsetIdx = 6; +static const unsigned kMatVecMulMatrixInterpretationIdx = 7; +static const unsigned kMatVecMulMatrixMIdx = 8; +static const unsigned kMatVecMulMatrixKIdx = 9; +static const unsigned kMatVecMulMatrixLayoutIdx = 10; +static const unsigned kMatVecMulMatrixTransposeIdx = 11; +static const unsigned kMatVecMulMatrixStrideIdx = 12; + +// MatVecAdd +const unsigned kMatVecMulAddBiasInterpretation = 15; + +static bool IsValidMatrixLayoutForMulAndMulAddOps(unsigned Layout) { + return Layout <= + static_cast(DXIL::LinalgMatrixLayout::OuterProductOptimal); +} + +static bool IsOptimalTypeMatrixLayout(unsigned Layout) { + return ( + Layout == (static_cast(DXIL::LinalgMatrixLayout::MulOptimal)) || + (Layout == + (static_cast(DXIL::LinalgMatrixLayout::OuterProductOptimal)))); +} + +static bool IsValidTransposeForMatrixLayout(unsigned Layout, bool Transposed) { + switch (static_cast(Layout)) { + case DXIL::LinalgMatrixLayout::RowMajor: + case DXIL::LinalgMatrixLayout::ColumnMajor: + return !Transposed; + + default: + return true; + } +} + +static bool IsPackedType(unsigned type) { + return (type == static_cast(DXIL::ComponentType::PackedS8x32) || + type == static_cast(DXIL::ComponentType::PackedU8x32)); +} + +static bool IsValidLinalgTypeInterpretation(uint32_t Input, bool InRegister) { + + switch (static_cast(Input)) { + case DXIL::ComponentType::I16: + case DXIL::ComponentType::U16: + case DXIL::ComponentType::I32: + case DXIL::ComponentType::U32: + case DXIL::ComponentType::F16: + case DXIL::ComponentType::F32: + case DXIL::ComponentType::U8: + case 
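For reference, a standalone sketch of the two layout rules added above. The enum values are assumed here only so the sketch compiles; the patch relies on the real DXIL::LinalgMatrixLayout constants.

#include <cassert>

enum class LinalgMatrixLayout : unsigned {
  RowMajor = 0,
  ColumnMajor = 1,
  MulOptimal = 2,
  OuterProductOptimal = 3,
};

// Mirrors IsValidMatrixLayoutForMulAndMulAddOps: any value up to and including
// OuterProductOptimal is a recognized layout.
bool IsValidLayout(unsigned Layout) {
  return Layout <=
         static_cast<unsigned>(LinalgMatrixLayout::OuterProductOptimal);
}

// Mirrors IsValidTransposeForMatrixLayout: the explicit row/column-major
// layouts may not be transposed; the opaque "optimal" layouts may be.
bool IsValidTranspose(unsigned Layout, bool Transposed) {
  switch (static_cast<LinalgMatrixLayout>(Layout)) {
  case LinalgMatrixLayout::RowMajor:
  case LinalgMatrixLayout::ColumnMajor:
    return !Transposed;
  default:
    return true;
  }
}

int main() {
  assert(!IsValidLayout(7));
  assert(!IsValidTranspose(0, /*Transposed=*/true)); // row-major: not allowed
  assert(IsValidTranspose(3, /*Transposed=*/true));  // outer-product-optimal: allowed
}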
DXIL::ComponentType::I8: + case DXIL::ComponentType::F8_E4M3: + case DXIL::ComponentType::F8_E5M2: + return true; + case DXIL::ComponentType::PackedS8x32: + case DXIL::ComponentType::PackedU8x32: + return InRegister; + default: + return false; + } +} + +static bool IsValidVectorAndMatrixDimensions(Sema &S, CallExpr *CE, + unsigned InputVectorSize, + unsigned OutputVectorSize, + unsigned MatrixK, unsigned MatrixM, + bool isInputPacked) { + // Check if output vector size equals to matrix dimension M + if (OutputVectorSize != MatrixM) { + Expr *OutputVector = CE->getArg(kMatVecMulOutputVectorIdx); + S.Diags.Report( + OutputVector->getExprLoc(), + diag:: + err_hlsl_linalg_mul_muladd_output_vector_size_not_equal_to_matrix_M); + return false; + } + + // Check if input vector size equals to matrix dimension K in the unpacked + // case. + // Check if input vector size equals the smallest number that can hold + // matrix dimension K values + const unsigned PackingFactor = isInputPacked ? 4 : 1; + unsigned MinInputVectorSize = (MatrixK + PackingFactor - 1) / PackingFactor; + if (InputVectorSize != MinInputVectorSize) { + Expr *InputVector = CE->getArg(kMatVecMulInputVectorIdx); + if (isInputPacked) { + S.Diags.Report( + InputVector->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_packed_input_vector_size_incorrect); + return false; + } else { + S.Diags.Report( + InputVector->getExprLoc(), + diag:: + err_hlsl_linalg_mul_muladd_unpacked_input_vector_size_not_equal_to_matrix_K); + return false; + } + } + + return true; +} + +static void CheckCommonMulAndMulAddParameters(Sema &S, CallExpr *CE, + const hlsl::ShaderModel *SM) { + // Check if IsOutputUnsigned is a const parameter + bool IsOutputUnsignedFlagValue = false; + Expr *IsOutputUnsignedExpr = CE->getArg(kMatVecMulOutputIsUnsignedIdx); + llvm::APSInt IsOutputUnsignedExprVal; + if (IsOutputUnsignedExpr->isIntegerConstantExpr(IsOutputUnsignedExprVal, + S.Context)) { + IsOutputUnsignedFlagValue = IsOutputUnsignedExprVal.getBoolValue(); + } else { + S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(), diag::err_expr_not_ice) + << 0; + return; + } + + Expr *OutputVectorExpr = CE->getArg(kMatVecMulOutputVectorIdx); + unsigned OutputVectorSizeValue = 0; + if (IsHLSLVecType(OutputVectorExpr->getType())) { + OutputVectorSizeValue = GetHLSLVecSize(OutputVectorExpr->getType()); + QualType OutputVectorType = + GetHLSLVecElementType(OutputVectorExpr->getType()); + const Type *OutputVectorTypePtr = OutputVectorType.getTypePtr(); + + // Check if IsOutputUnsigned flag matches output vector type. + // Must be true for unsigned int outputs, false for signed int/float + // outputs. + if (IsOutputUnsignedFlagValue && + !OutputVectorTypePtr->isUnsignedIntegerType()) { + DXASSERT_NOMSG(OutputVectorTypePtr->isSignedIntegerType() || + OutputVectorTypePtr->isFloatingType()); + S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(), + diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type) + << "IsOuputUnsigned" << false + << (OutputVectorTypePtr->isSignedIntegerType() ? 
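A minimal sketch of the dimension rule enforced by IsValidVectorAndMatrixDimensions above: the output vector must have exactly M elements, and the input vector must have ceil(K / PackingFactor) elements, where the packing factor is 4 for the packed (u)int8 interpretations and 1 otherwise. Function and parameter names are illustrative.

#include <cassert>

bool DimensionsAreValid(unsigned InputVectorSize, unsigned OutputVectorSize,
                        unsigned MatrixK, unsigned MatrixM, bool InputPacked) {
  if (OutputVectorSize != MatrixM)
    return false;
  const unsigned PackingFactor = InputPacked ? 4 : 1;
  // Smallest vector that can hold K values at the given packing: ceil(K / factor).
  const unsigned MinInputVectorSize =
      (MatrixK + PackingFactor - 1) / PackingFactor;
  return InputVectorSize == MinInputVectorSize;
}

int main() {
  // Unpacked: the input length must equal K exactly.
  assert(DimensionsAreValid(/*In*/ 16, /*Out*/ 8, /*K*/ 16, /*M*/ 8, false));
  // Packed int8: 18 values fit in ceil(18/4) = 5 32-bit elements.
  assert(DimensionsAreValid(/*In*/ 5, /*Out*/ 8, /*K*/ 18, /*M*/ 8, true));
  assert(!DimensionsAreValid(/*In*/ 4, /*Out*/ 8, /*K*/ 18, /*M*/ 8, true));
}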
1 : 0); + return; + } else if (!IsOutputUnsignedFlagValue && + OutputVectorTypePtr->isUnsignedIntegerType()) { + S.Diags.Report(IsOutputUnsignedExpr->getExprLoc(), + diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type) + << "IsOuputUnsigned" << true << 2; + return; + } + } + + // Check if isInputUnsigned parameter is a constant + bool IsInputUnsignedFlagValue = false; + Expr *IsInputUnsignedExpr = CE->getArg(kMatVecMulIsInputUnsignedIdx); + llvm::APSInt IsInputUnsignedExprVal; + if (IsInputUnsignedExpr->isIntegerConstantExpr(IsInputUnsignedExprVal, + S.Context)) { + IsInputUnsignedFlagValue = IsInputUnsignedExprVal.getBoolValue(); + } else { + S.Diags.Report(IsInputUnsignedExpr->getExprLoc(), diag::err_expr_not_ice) + << 0; + return; + } + + // Get InputInterpretation, check if it is constant + Expr *InputInterpretationExpr = CE->getArg(kMatVecMulInputInterpretationIdx); + llvm::APSInt InputInterpretationExprVal; + unsigned InputInterpretationValue = 0; + if (InputInterpretationExpr->isIntegerConstantExpr(InputInterpretationExprVal, + S.Context)) { + InputInterpretationValue = InputInterpretationExprVal.getLimitedValue(); + const bool InRegisterInterpretation = true; + if (!IsValidLinalgTypeInterpretation(InputInterpretationValue, + InRegisterInterpretation)) { + S.Diags.Report(InputInterpretationExpr->getExprLoc(), + diag::err_hlsl_linalg_interpretation_value_incorrect) + << std::to_string(InputInterpretationValue) + << InRegisterInterpretation; + return; + } + } else { + S.Diags.Report(InputInterpretationExpr->getExprLoc(), + diag::err_expr_not_ice) + << 0; + return; + } + + bool IsInputVectorPacked = IsPackedType(InputInterpretationValue); + + // For packed types input vector type must be uint and isUnsigned must be + // true. The signedness is determined from the InputInterpretation + Expr *InputVectorExpr = CE->getArg(kMatVecMulInputVectorIdx); + unsigned InputVectorSizeValue = 0; + if (IsHLSLVecType(InputVectorExpr->getType())) { + InputVectorSizeValue = GetHLSLVecSize(InputVectorExpr->getType()); + QualType InputVectorType = + GetHLSLVecElementType(InputVectorExpr->getType()); + unsigned BitWidth = S.Context.getTypeSize(InputVectorType); + bool Is32Bit = (BitWidth == 32); + const Type *InputVectorTypePtr = InputVectorType.getTypePtr(); + + // Check if the isUnsigned flag setting + if (IsInputVectorPacked) { + // Check that the input vector element type is "32bit" + if (!Is32Bit) { + S.Diags.Report( + InputVectorExpr->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint); + return; + } + + // Check that the input vector element type is an unsigned int + if (!InputVectorTypePtr->isUnsignedIntegerType()) { + S.Diags.Report( + InputVectorExpr->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_packed_input_vector_must_be_uint); + return; + } + + // Check that isInputUnsigned is always true + // Actual signedness is inferred from the InputInterpretation + if (!IsInputUnsignedFlagValue) { + S.Diags.Report( + IsInputUnsignedExpr->getExprLoc(), + diag:: + err_hlsl_linalg_mul_muladd_isUnsigned_for_packed_input_must_be_true); + return; + } + } else { + if (IsInputUnsignedFlagValue && + !InputVectorTypePtr->isUnsignedIntegerType()) { + DXASSERT_NOMSG(InputVectorTypePtr->isSignedIntegerType() || + InputVectorTypePtr->isFloatingType()); + S.Diags.Report( + IsInputUnsignedExpr->getExprLoc(), + diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type) + << "IsInputUnsigned" << false + << (InputVectorTypePtr->isSignedIntegerType() ? 
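A sketch of the signedness-consistency rule applied above to the output vector and, further down, to unpacked input vectors: the IsUnsigned flag must be true exactly when the vector's element type is an unsigned integer, and false for signed integer or floating-point elements. The helper below is a simplified stand-in for the clang type queries.

#include <cassert>

enum class ElemClass { UnsignedInt, SignedInt, Float };

bool UnsignedFlagMatchesElement(bool IsUnsignedFlag, ElemClass Elem) {
  if (IsUnsignedFlag)
    return Elem == ElemClass::UnsignedInt; // flag true requires an unsigned element
  return Elem != ElemClass::UnsignedInt;   // flag false forbids an unsigned element
}

int main() {
  assert(UnsignedFlagMatchesElement(true, ElemClass::UnsignedInt));
  assert(!UnsignedFlagMatchesElement(true, ElemClass::Float));
  assert(!UnsignedFlagMatchesElement(false, ElemClass::UnsignedInt));
}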
1 : 0); + return; + } else if (!IsInputUnsignedFlagValue && + InputVectorTypePtr->isUnsignedIntegerType()) { + S.Diags.Report( + IsInputUnsignedExpr->getExprLoc(), + diag::err_hlsl_linalg_isunsigned_incorrect_for_given_type) + << "IsInputUnsigned" << true << 2; + return; + } + } + } + + // Get Matrix Dimensions M and K, check if they are constants + Expr *MatrixKExpr = CE->getArg(kMatVecMulMatrixKIdx); + llvm::APSInt MatrixKExprVal; + unsigned MatrixKValue = 0; + if (MatrixKExpr->isIntegerConstantExpr(MatrixKExprVal, S.Context)) { + MatrixKValue = MatrixKExprVal.getLimitedValue(); + } else { + S.Diags.Report(MatrixKExpr->getExprLoc(), diag::err_expr_not_ice) << 0; + return; + } + + Expr *MatrixMExpr = CE->getArg(kMatVecMulMatrixMIdx); + llvm::APSInt MatrixMExprVal; + unsigned MatrixMValue = 0; + if (MatrixMExpr->isIntegerConstantExpr(MatrixMExprVal, S.Context)) { + MatrixMValue = MatrixMExprVal.getLimitedValue(); + } else { + S.Diags.Report(MatrixMExpr->getExprLoc(), diag::err_expr_not_ice) << 0; + return; + } + + // Check MatrixM and MatrixK values are non-zero + if (MatrixMValue == 0) { + S.Diags.Report(MatrixMExpr->getExprLoc(), + diag::err_hlsl_linalg_matrix_dim_must_be_greater_than_zero) + << std::to_string(DXIL::kSM69MaxVectorLength); + return; + } + + if (MatrixKValue == 0) { + S.Diags.Report(MatrixKExpr->getExprLoc(), + diag::err_hlsl_linalg_matrix_dim_must_be_greater_than_zero) + << std::to_string(DXIL::kSM69MaxVectorLength); + return; + } + + // Check MatrixM and MatrixK values are less than max + // Matrix dimension cannot exceed largest vector length in a Mul/MulAdd + // operation. + if (MatrixMValue > DXIL::kSM69MaxVectorLength) { + S.Diags.Report(MatrixMExpr->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_invalid_dim) + << 0 << std::to_string(DXIL::kSM69MaxVectorLength); + return; + } + + // For packed input vectors 4 values are packed in a uint, so max Matrix K + // can be 4096 + if (IsInputVectorPacked) { + const unsigned PackingFactor = + 4; // Only supported packed formats: DATA_TYPE_(U)SINT8_T4_PACKED + if (MatrixKValue > DXIL::kSM69MaxVectorLength * PackingFactor) { + S.Diags.Report(MatrixKExpr->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_invalid_dim) + << 2 << std::to_string(DXIL::kSM69MaxVectorLength * PackingFactor); + return; + } + } else { + if (MatrixKValue > DXIL::kSM69MaxVectorLength) { + S.Diags.Report(MatrixKExpr->getExprLoc(), + diag::err_hlsl_linalg_mul_muladd_invalid_dim) + << 1 << std::to_string(DXIL::kSM69MaxVectorLength); + return; + } + } + + if (!IsValidVectorAndMatrixDimensions(S, CE, InputVectorSizeValue, + OutputVectorSizeValue, MatrixKValue, + MatrixMValue, IsInputVectorPacked)) { + return; + } + + // Get MatrixInterpretation, check if it is constant + // Make sure it is a valid value + Expr *MatrixInterpretationExpr = + CE->getArg(kMatVecMulMatrixInterpretationIdx); + llvm::APSInt MatrixInterpretationExprVal; + unsigned MatrixInterpretationValue = 0; + if (MatrixInterpretationExpr->isIntegerConstantExpr( + MatrixInterpretationExprVal, S.Context)) { + MatrixInterpretationValue = MatrixInterpretationExprVal.getLimitedValue(); + const bool InRegisterInterpretation = false; + if (!IsValidLinalgTypeInterpretation(MatrixInterpretationValue, + InRegisterInterpretation)) { + S.Diags.Report(MatrixInterpretationExpr->getExprLoc(), + diag::err_hlsl_linalg_interpretation_value_incorrect) + << std::to_string(MatrixInterpretationValue) + << InRegisterInterpretation; + return; + } + } else { + S.Diags.Report(MatrixInterpretationExpr->getExprLoc(), + 
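For reference, a sketch of the dimension bounds checked above: M and K must be non-zero, M is capped at the maximum vector length, and K is capped at four times that limit when the input vector is packed, since four packed 8-bit values share one 32-bit element. The value 1024 for kSM69MaxVectorLength is an assumption for this sketch; the patch uses the real DXIL constant.

#include <cassert>

static const unsigned kSM69MaxVectorLength = 1024; // assumed for the sketch

bool MatrixDimsInRange(unsigned M, unsigned K, bool InputPacked) {
  if (M == 0 || K == 0)
    return false;                       // dimensions must be greater than zero
  if (M > kSM69MaxVectorLength)
    return false;                       // M bounded by the max vector length
  const unsigned MaxK =
      InputPacked ? kSM69MaxVectorLength * 4 : kSM69MaxVectorLength;
  return K <= MaxK;                     // packed inputs allow K up to 4x the limit
}

int main() {
  assert(MatrixDimsInRange(1024, 4096, /*InputPacked=*/true));
  assert(!MatrixDimsInRange(1024, 4096, /*InputPacked=*/false));
  assert(!MatrixDimsInRange(0, 16, false));
}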
diag::err_expr_not_ice) + << 0; + return; + } + + // Get MatrixLayout, check if it is constant and valid value + Expr *MatrixLayoutExpr = CE->getArg(kMatVecMulMatrixLayoutIdx); + llvm::APSInt MatrixLayoutExprVal; + unsigned MatrixLayoutValue = 0; + if (MatrixLayoutExpr->isIntegerConstantExpr(MatrixLayoutExprVal, S.Context)) { + MatrixLayoutValue = MatrixLayoutExprVal.getLimitedValue(); + if (!IsValidMatrixLayoutForMulAndMulAddOps(MatrixLayoutValue)) { + S.Diags.Report(MatrixLayoutExpr->getExprLoc(), + diag::err_hlsl_linalg_matrix_layout_invalid) + << std::to_string(MatrixLayoutValue) + << std::to_string( + static_cast(DXIL::LinalgMatrixLayout::RowMajor)) + << std::to_string(static_cast( + DXIL::LinalgMatrixLayout::OuterProductOptimal)); + return; + } + } else { + S.Diags.Report(MatrixLayoutExpr->getExprLoc(), diag::err_expr_not_ice) << 0; + return; + } + + // Get MatrixTranspose, check if it is constant + Expr *MatrixTransposeExpr = CE->getArg(kMatVecMulMatrixTransposeIdx); + llvm::APSInt MatrixTransposeExprVal; + unsigned MatrixTransposeValue = 0; + if (MatrixTransposeExpr->isIntegerConstantExpr(MatrixTransposeExprVal, + S.Context)) { + MatrixTransposeValue = MatrixTransposeExprVal.getBoolValue(); + if (!IsValidTransposeForMatrixLayout(MatrixLayoutValue, + MatrixTransposeValue)) { + + S.Diags.Report(MatrixTransposeExpr->getExprLoc(), + diag::err_hlsl_linalg_matrix_layout_is_not_transposable); + return; + } + } else { + S.Diags.Report(MatrixTransposeExpr->getExprLoc(), diag::err_expr_not_ice) + << 0; + return; + } + + // Get MatrixStride, check if it is constant, if yes it should be zero + // for optimal layouts + Expr *MatrixStrideExpr = CE->getArg(kMatVecMulMatrixStrideIdx); + llvm::APSInt MatrixStrideExprVal; + unsigned MatrixStrideValue = 0; + if (MatrixStrideExpr->isIntegerConstantExpr(MatrixStrideExprVal, S.Context)) { + MatrixStrideValue = MatrixStrideExprVal.getLimitedValue(); + if (IsOptimalTypeMatrixLayout(MatrixLayoutValue) && + MatrixStrideValue != 0) { + S.Diags.Report( + MatrixStrideExpr->getExprLoc(), + diag:: + err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero); + return; + } + } +} + +static void CheckMulCall(Sema &S, FunctionDecl *FD, CallExpr *CE, + const hlsl::ShaderModel *SM) { + CheckCommonMulAndMulAddParameters(S, CE, SM); +} + +static void CheckMulAddCall(Sema &S, FunctionDecl *FD, CallExpr *CE, + const hlsl::ShaderModel *SM) { + CheckCommonMulAndMulAddParameters(S, CE, SM); + + // Check if BiasInterpretation is constant and a valid value + Expr *BiasInterpretationExpr = CE->getArg(kMatVecMulAddBiasInterpretation); + llvm::APSInt BiasInterpretationExprVal; + unsigned BiasInterpretationValue = 0; + if (BiasInterpretationExpr->isIntegerConstantExpr(BiasInterpretationExprVal, + S.Context)) { + BiasInterpretationValue = BiasInterpretationExprVal.getLimitedValue(); + const bool InRegisterInterpretation = false; + if (!IsValidLinalgTypeInterpretation(BiasInterpretationValue, + InRegisterInterpretation)) { + S.Diags.Report(BiasInterpretationExpr->getExprLoc(), + diag::err_hlsl_linalg_interpretation_value_incorrect) + << std::to_string(BiasInterpretationValue) + << InRegisterInterpretation; + return; + } + } else { + S.Diags.Report(BiasInterpretationExpr->getExprLoc(), diag::err_expr_not_ice) + << 0; + return; + } +} + +// Linalg Outer Product Accumulate +// OuterProductAccumulate builtin function parameters +static const unsigned kOuterProdAccInputVector1Idx = 0; +static const unsigned kOuterProdAccInputVector2Idx = 1; +// static const unsigned 
kOuterProdAccMatrixBufferIdx = 2; +// static const unsigned kOuterProdAccMatrixOffsetIdx = 3; +static const unsigned kOuterProdAccMatrixInterpretationIdx = 4; +static const unsigned kOuterProdAccMatrixLayoutIdx = 5; +static const unsigned kOuterProdAccMatrixStrideIdx = 6; + +static void CheckOuterProductAccumulateCall(Sema &S, FunctionDecl *FD, + CallExpr *CE) { + // Check InputVector1 and InputVector2 are the same type + const Expr *InputVector1Expr = CE->getArg(kOuterProdAccInputVector1Idx); + const Expr *InputVector2Expr = CE->getArg(kOuterProdAccInputVector2Idx); + QualType InputVector1Type = InputVector1Expr->getType(); + QualType InputVector2Type = InputVector2Expr->getType(); + + // Get the element types of the vectors + const QualType InputVector1ElementType = + GetHLSLVecElementType(InputVector1Type); + const QualType InputVector2ElementType = + GetHLSLVecElementType(InputVector2Type); + + if (!S.Context.hasSameType(InputVector1ElementType, + InputVector2ElementType)) { + S.Diags.Report(InputVector2Expr->getExprLoc(), + diag::err_hlsl_linalg_outer_prod_acc_vector_type_mismatch); + return; + } + + // Check Matrix Interpretation is a constant and a valid value + Expr *MatrixInterpretationExpr = + CE->getArg(kOuterProdAccMatrixInterpretationIdx); + llvm::APSInt MatrixInterpretationExprVal; + unsigned MatrixInterpretationValue = 0; + if (MatrixInterpretationExpr->isIntegerConstantExpr( + MatrixInterpretationExprVal, S.Context)) { + MatrixInterpretationValue = MatrixInterpretationExprVal.getLimitedValue(); + const bool InRegisterInterpretation = false; + if (!IsValidLinalgTypeInterpretation(MatrixInterpretationValue, + InRegisterInterpretation)) { + S.Diags.Report(MatrixInterpretationExpr->getExprLoc(), + diag::err_hlsl_linalg_interpretation_value_incorrect) + << std::to_string(MatrixInterpretationValue) + << InRegisterInterpretation; + return; + } + } else { + S.Diags.Report(MatrixInterpretationExpr->getExprLoc(), + diag::err_expr_not_ice) + << 0; + return; + } + + // Check Matrix Layout must be a constant and Training Optimal + Expr *MatrixLayoutExpr = CE->getArg(kOuterProdAccMatrixLayoutIdx); + llvm::APSInt MatrixLayoutExprVal; + unsigned MatrixLayoutValue = 0; + if (MatrixLayoutExpr->isIntegerConstantExpr(MatrixLayoutExprVal, S.Context)) { + MatrixLayoutValue = MatrixLayoutExprVal.getLimitedValue(); + if (MatrixLayoutValue != + static_cast(DXIL::LinalgMatrixLayout::OuterProductOptimal)) { + S.Diags.Report( + MatrixLayoutExpr->getExprLoc(), + diag:: + err_hlsl_linalg_outer_prod_acc_matrix_layout_must_be_outer_prod_acc_optimal) + << std::to_string(static_cast( + DXIL::LinalgMatrixLayout::OuterProductOptimal)); + return; + } + } else { + S.Diags.Report(MatrixLayoutExpr->getExprLoc(), diag::err_expr_not_ice) << 0; + return; + } + + // Matrix Stride must be zero (Training Optimal matrix layout) + Expr *MatrixStrideExpr = CE->getArg(kOuterProdAccMatrixStrideIdx); + llvm::APSInt MatrixStrideExprVal; + unsigned MatrixStrideValue = 0; + if (MatrixStrideExpr->isIntegerConstantExpr(MatrixStrideExprVal, S.Context)) { + MatrixStrideValue = MatrixStrideExprVal.getLimitedValue(); + if (MatrixStrideValue != 0) { + S.Diags.Report( + MatrixStrideExpr->getExprLoc(), + diag:: + err_hlsl_linalg_optimal_matrix_layout_matrix_stride_must_be_zero); + return; + } + } +} + #ifdef ENABLE_SPIRV_CODEGEN static bool CheckVKBufferPointerCast(Sema &S, FunctionDecl *FD, CallExpr *CE, bool isStatic) { @@ -11656,9 +12276,78 @@ static bool CheckVKBufferPointerCast(Sema &S, FunctionDecl *FD, CallExpr *CE, } #endif 
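A compact sketch of the OuterProductAccumulate constraints checked in the function above: both input vectors must share an element type, the matrix layout must be the outer-product-optimal layout, and the stride must be zero because that layout is opaque. The layout value 3 and the string-based element-type comparison are assumptions for the sketch only.

#include <cassert>
#include <string>

static const unsigned kOuterProductOptimal = 3; // assumed numeric value

bool OuterProductArgsValid(const std::string &Elem1, const std::string &Elem2,
                           unsigned Layout, unsigned Stride) {
  if (Elem1 != Elem2)
    return false;                      // input vector element types must match
  if (Layout != kOuterProductOptimal)
    return false;                      // only the outer-product-optimal layout
  return Stride == 0;                  // optimal layouts are opaque: stride is 0
}

int main() {
  assert(OuterProductArgsValid("half", "half", 3, 0));
  assert(!OuterProductArgsValid("half", "float", 3, 0));
  assert(!OuterProductArgsValid("half", "half", 0, 0));
}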
+static bool isRelatedDeclMarkedNointerpolation(Expr *E) { + if (!E) + return false; + E = E->IgnoreCasts(); + if (auto *DRE = dyn_cast(E)) + return DRE->getDecl()->hasAttr(); + + if (auto *ME = dyn_cast(E)) + return ME->getMemberDecl()->hasAttr() || + isRelatedDeclMarkedNointerpolation(ME->getBase()); + + if (auto *HVE = dyn_cast(E)) + return isRelatedDeclMarkedNointerpolation(HVE->getBase()); + + if (auto *ASE = dyn_cast(E)) + return isRelatedDeclMarkedNointerpolation(ASE->getBase()); + + return false; +} + +static bool CheckIntrinsicGetAttributeAtVertex(Sema &S, FunctionDecl *FDecl, + CallExpr *TheCall) { + assert(TheCall->getNumArgs() > 0); + auto argument = TheCall->getArg(0)->IgnoreCasts(); + + if (!isRelatedDeclMarkedNointerpolation(argument)) { + S.Diag(argument->getExprLoc(), diag::err_hlsl_parameter_requires_attribute) + << 0 << FDecl->getName() << "nointerpolation"; + return true; + } + + return false; +} + +static bool CheckNoInterpolationParams(Sema &S, FunctionDecl *FDecl, + CallExpr *TheCall) { + // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want + // to limit the scope, and fail gracefully in some cases. + if (!S.getLangOpts().SPIRV) + return false; + + bool error = false; + for (unsigned i = 0; i < FDecl->getNumParams(); i++) { + assert(i < TheCall->getNumArgs()); + + if (!FDecl->getParamDecl(i)->hasAttr()) + continue; + + if (!isRelatedDeclMarkedNointerpolation(TheCall->getArg(i))) { + S.Diag(TheCall->getArg(i)->getExprLoc(), + diag::err_hlsl_parameter_requires_attribute) + << i << FDecl->getName() << "nointerpolation"; + error = true; + } + } + + return error; +} + +// Verify that user-defined intrinsic struct args contain no long vectors +static bool CheckUDTIntrinsicArg(Sema &S, Expr *Arg) { + const TypeDiagContext DiagContext = + TypeDiagContext::UserDefinedStructParameter; + return DiagnoseTypeElements(S, Arg->getExprLoc(), Arg->getType(), DiagContext, + DiagContext); +} + // Check HLSL call constraints, not fatal to creating the AST. -void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, - const FunctionProtoType *Proto) { +void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall) { + if (CheckNoInterpolationParams(*this, FDecl, TheCall)) + return; + HLSLIntrinsicAttr *IntrinsicAttr = FDecl->getAttr(); if (!IntrinsicAttr) return; @@ -11677,6 +12366,37 @@ void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, case hlsl::IntrinsicOp::IOP_Barrier: CheckBarrierCall(*this, FDecl, TheCall, SM); break; + case hlsl::IntrinsicOp::IOP___builtin_MatVecMul: + CheckMulCall(*this, FDecl, TheCall, SM); + break; + case hlsl::IntrinsicOp::IOP___builtin_MatVecMulAdd: + CheckMulAddCall(*this, FDecl, TheCall, SM); + break; + case hlsl::IntrinsicOp::IOP___builtin_OuterProductAccumulate: + CheckOuterProductAccumulateCall(*this, FDecl, TheCall); + break; + case hlsl::IntrinsicOp::IOP_GetAttributeAtVertex: + // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want + // to limit the scope, and fail gracefully in some cases. 
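A simplified model of the isRelatedDeclMarkedNointerpolation walk relocated above: the argument must ultimately name a declaration carrying the nointerpolation attribute, so the check descends through member accesses, vector-element accesses, and array subscripts down to the base reference. The Node struct is a toy stand-in for the relevant Expr nodes.

#include <cassert>
#include <memory>

struct Node {
  bool HasNoInterpolation = false; // attribute on the referenced decl, if any
  std::shared_ptr<Node> Base;      // base of a member/element/subscript expr
};

bool RelatedDeclMarkedNoInterpolation(const std::shared_ptr<Node> &E) {
  if (!E)
    return false;
  if (E->HasNoInterpolation)
    return true;
  return RelatedDeclMarkedNoInterpolation(E->Base);
}

int main() {
  // Models something like input.color[2] where 'input' is declared nointerpolation.
  auto DeclRef = std::make_shared<Node>();
  DeclRef->HasNoInterpolation = true;
  auto Member = std::make_shared<Node>();
  Member->Base = DeclRef;
  auto Subscript = std::make_shared<Node>();
  Subscript->Base = Member;
  assert(RelatedDeclMarkedNoInterpolation(Subscript));
}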
+ if (!getLangOpts().SPIRV) + return; + CheckIntrinsicGetAttributeAtVertex(*this, FDecl, TheCall); + break; + case hlsl::IntrinsicOp::IOP_DispatchMesh: + CheckUDTIntrinsicArg(*this, TheCall->getArg(3)->IgnoreCasts()); + break; + case hlsl::IntrinsicOp::IOP_CallShader: + CheckUDTIntrinsicArg(*this, TheCall->getArg(1)->IgnoreCasts()); + break; + case hlsl::IntrinsicOp::IOP_TraceRay: + CheckUDTIntrinsicArg(*this, TheCall->getArg(7)->IgnoreCasts()); + break; + case hlsl::IntrinsicOp::IOP_ReportHit: + CheckIntersectionAttributeArg(*this, TheCall->getArg(2)->IgnoreCasts()); + break; + case hlsl::IntrinsicOp::MOP_DxHitObject_GetAttributes: + CheckIntersectionAttributeArg(*this, TheCall->getArg(0)->IgnoreCasts()); + break; #ifdef ENABLE_SPIRV_CODEGEN case hlsl::IntrinsicOp::IOP_Vkreinterpret_pointer_cast: CheckVKBufferPointerCast(*this, FDecl, TheCall, false); @@ -12066,8 +12786,11 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM, case hlsl::IntrinsicOp::MOP_TraceRayInline: DiagnoseTraceRayInline(*this, CE); break; + case hlsl::IntrinsicOp::MOP_DxHitObject_FromRayQuery: + case hlsl::IntrinsicOp::MOP_DxHitObject_Invoke: case hlsl::IntrinsicOp::MOP_DxHitObject_MakeMiss: case hlsl::IntrinsicOp::MOP_DxHitObject_MakeNop: + case hlsl::IntrinsicOp::MOP_DxHitObject_TraceRay: DiagnoseReachableSERCall(*this, CE, EntrySK, EntryDecl, false); break; case hlsl::IntrinsicOp::IOP_DxMaybeReorderThread: @@ -12080,34 +12803,73 @@ void Sema::DiagnoseReachableHLSLCall(CallExpr *CE, const hlsl::ShaderModel *SM, ///////////////////////////////////////////////////////////////////////////// -bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc, - QualType ArgTy, bool &Empty, - const FieldDecl *FD) { - DXASSERT_NOMSG(!ArgTy.isNull()); +static bool AllowObjectInContext(QualType Ty, TypeDiagContext DiagContext) { + // Disallow all object in template type parameters (former + // err_hlsl_objectintemplateargument) + if (DiagContext == TypeDiagContext::TypeParameter) + return false; + // Disallow all objects in node records (former + // err_hlsl_node_record_object) + if (DiagContext == TypeDiagContext::NodeRecords) + return false; + // TODO: Extend this list for other object types. + if (IsHLSLHitObjectType(Ty)) + return false; + return true; +} - HLSLExternalSource *source = HLSLExternalSource::FromSema(self); - ArTypeObjectKind shapeKind = source->GetTypeObjectKind(ArgTy); - switch (shapeKind) { +// Determine if `Ty` is valid in this `DiagContext` and/or an empty type. If +// invalid returns false and Sema `S`, location `Loc`, error index +// `DiagContext`, and FieldDecl `FD` are used to emit diagnostics. If +// `CheckLongVec` is set, errors are produced if `Ty` is a long vector. If the +// type is not empty, `Empty` is set to false. `CheckedDecls` is used to prevent +// redundant recursive type checks. 
+static bool +DiagnoseElementTypes(Sema &S, SourceLocation Loc, QualType Ty, bool &Empty, + TypeDiagContext ObjDiagContext, + TypeDiagContext LongVecDiagContext, + llvm::SmallPtrSet &CheckedDecls, + const clang::FieldDecl *FD) { + if (Ty.isNull() || Ty->isDependentType()) + return false; + + const bool CheckLongVec = LongVecDiagContext != TypeDiagContext::Valid; + const bool CheckObjects = ObjDiagContext != TypeDiagContext::Valid; + + while (const ArrayType *Arr = Ty->getAsArrayTypeUnsafe()) + Ty = Arr->getElementType(); + + const int ObjDiagContextIdx = static_cast(ObjDiagContext); + const int LongVecDiagContextIdx = static_cast(LongVecDiagContext); + DXASSERT_NOMSG( + LongVecDiagContext == TypeDiagContext::Valid || + (0 <= LongVecDiagContextIdx && + LongVecDiagContextIdx <= + static_cast(TypeDiagContext::LongVecDiagMaxSelectIndex))); + + HLSLExternalSource *Source = HLSLExternalSource::FromSema(&S); + ArTypeObjectKind ShapeKind = Source->GetTypeObjectKind(Ty); + switch (ShapeKind) { case AR_TOBJ_VECTOR: - if (GetHLSLVecSize(ArgTy) > DXIL::kDefaultMaxVectorLength) { - const unsigned NodeRecordsIdx = 3; - self->Diag(ArgLoc.getLocation(), diag::err_hlsl_unsupported_long_vector) - << NodeRecordsIdx; + if (CheckLongVec && GetHLSLVecSize(Ty) > DXIL::kDefaultMaxVectorLength) { + S.Diag(Loc, diag::err_hlsl_unsupported_long_vector) + << LongVecDiagContextIdx; Empty = false; return false; } LLVM_FALLTHROUGH; - case AR_TOBJ_ARRAY: case AR_TOBJ_BASIC: case AR_TOBJ_MATRIX: Empty = false; return false; case AR_TOBJ_OBJECT: Empty = false; - self->Diag(ArgLoc.getLocation(), diag::err_hlsl_node_record_object) - << ArgTy << ArgLoc.getSourceRange(); + if (!CheckObjects || AllowObjectInContext(Ty, ObjDiagContext)) + return false; + S.Diag(Loc, diag::err_hlsl_unsupported_object_context) + << Ty << ObjDiagContextIdx; if (FD) - self->Diag(FD->getLocation(), diag::note_field_declared_here) + S.Diag(FD->getLocation(), diag::note_field_declared_here) << FD->getType() << FD->getSourceRange(); return true; case AR_TOBJ_DEPENDENT: @@ -12116,25 +12878,55 @@ bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc, return true; case AR_TOBJ_COMPOUND: { bool ErrorFound = false; - const RecordDecl *RD = ArgTy->getAs()->getDecl(); + const RecordDecl *RD = Ty->getAs()->getDecl(); + // Never recurse redundantly into related subtypes that have already been + // checked. 
+ if (!CheckedDecls.insert(RD).second) + return false; + // Check the fields of the RecordDecl - for (auto *FD : RD->fields()) + for (auto *ElemFD : RD->fields()) { ErrorFound |= - DiagnoseNodeStructArgument(self, ArgLoc, FD->getType(), Empty, FD); - if (RD->isCompleteDefinition()) - if (auto *Child = dyn_cast(RD)) - // Walk up the inheritance chain and check base class fields - for (auto &B : Child->bases()) - ErrorFound |= - DiagnoseNodeStructArgument(self, ArgLoc, B.getType(), Empty); + DiagnoseElementTypes(S, Loc, ElemFD->getType(), Empty, ObjDiagContext, + LongVecDiagContext, CheckedDecls, ElemFD); + } + if (!RD->isCompleteDefinition()) + return ErrorFound; + + if (auto *Child = dyn_cast(RD)) + // Walk up the inheritance chain and check base class fields + for (auto &B : Child->bases()) + ErrorFound |= + DiagnoseElementTypes(S, Loc, B.getType(), Empty, ObjDiagContext, + LongVecDiagContext, CheckedDecls, nullptr); return ErrorFound; } default: - DXASSERT(false, "unreachable"); + // Not a recursive type, no element types to check here + Empty = false; return false; } } +bool hlsl::DiagnoseTypeElements(Sema &S, SourceLocation Loc, QualType Ty, + TypeDiagContext ObjDiagContext, + TypeDiagContext LongVecDiagContext, + const clang::FieldDecl *FD) { + bool Empty = false; + llvm::SmallPtrSet CheckedDecls; + return DiagnoseElementTypes(S, Loc, Ty, Empty, ObjDiagContext, + LongVecDiagContext, CheckedDecls, FD); +} + +bool hlsl::DiagnoseNodeStructArgument(Sema *self, TemplateArgumentLoc ArgLoc, + QualType ArgTy, bool &Empty, + const FieldDecl *FD) { + llvm::SmallPtrSet CheckedDecls; + return DiagnoseElementTypes(*self, ArgLoc.getLocation(), ArgTy, Empty, + TypeDiagContext::NodeRecords, + TypeDiagContext::NodeRecords, CheckedDecls, FD); +} + // This function diagnoses whether or not all entry-point attributes // should exist on this shader stage void DiagnoseEntryAttrAllowedOnStage(clang::Sema *self, @@ -12562,21 +13354,6 @@ bool hlsl::ShouldSkipNRVO(clang::Sema &sema, clang::QualType returnType, return false; } -bool hlsl::ContainsLongVector(QualType QT) { - if (QT.isNull() || QT->isDependentType()) - return false; - - while (const ArrayType *Arr = QT->getAsArrayTypeUnsafe()) - QT = Arr->getElementType(); - - if (CXXRecordDecl *Decl = QT->getAsCXXRecordDecl()) { - if (!Decl->isCompleteDefinition()) - return false; - return Decl->hasHLSLLongVector(); - } - return false; -} - bool hlsl::IsConversionToLessOrEqualElements( clang::Sema *self, const clang::ExprResult &sourceExpr, const clang::QualType &targetType, bool explicitConversion) { @@ -15247,8 +16024,8 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC, Expr *BitWidth, result = false; } - // Disallow long vecs from $Global cbuffers. - if (isGlobal && !isStatic && !isGroupShared && !IS_BASIC_OBJECT(basicKind)) { + // Disallow intangible HLSL objects in the global scope. + if (isGlobal) { // Suppress actual emitting of errors for incompletable types here // They are redundant to those produced in ActOnUninitializedDecl. 
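For reference, a standalone sketch of the recursive element walk that DiagnoseElementTypes performs above: arrays are peeled to their element type, vectors are checked against the default maximum length, records are scanned field by field, and a visited set keeps recursive record types from being checked twice. The Type struct is a toy model, and the default max vector length of 4 is assumed for the sketch.

#include <cassert>
#include <set>
#include <string>
#include <vector>

static const unsigned kDefaultMaxVectorLength = 4; // assumed for the sketch

struct Type {
  std::string Name;
  unsigned VectorSize = 0;            // > 0 for vectors
  const Type *ArrayElement = nullptr; // non-null for arrays
  std::vector<const Type *> Fields;   // non-empty for records
};

bool ContainsLongVector(const Type *T, std::set<const Type *> &Visited) {
  while (T->ArrayElement)             // peel arrays down to the element type
    T = T->ArrayElement;
  if (T->VectorSize > kDefaultMaxVectorLength)
    return true;
  if (!T->Fields.empty()) {
    if (!Visited.insert(T).second)    // already checked this record
      return false;
    for (const Type *F : T->Fields)   // recurse into record fields
      if (ContainsLongVector(F, Visited))
        return true;
  }
  return false;
}

int main() {
  Type Float8{"vector<float, 8>", 8};
  Type Wrapper{"struct S", 0, nullptr, {&Float8}};
  Type Arr{"S[4]", 0, &Wrapper};
  std::set<const Type *> Visited;
  assert(ContainsLongVector(&Arr, Visited)); // long vector found through array + struct
}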
struct SilentDiagnoser : public TypeDiagnoser { @@ -15256,12 +16033,22 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC, Expr *BitWidth, virtual void diagnose(Sema &S, SourceLocation Loc, QualType T) {} } SD; RequireCompleteType(D.getLocStart(), qt, SD); - if (ContainsLongVector(qt)) { - unsigned CbuffersOrTbuffersIdx = 4; - Diag(D.getLocStart(), diag::err_hlsl_unsupported_long_vector) - << CbuffersOrTbuffersIdx; + + // Disallow objects in the global context + TypeDiagContext ObjDiagContext = TypeDiagContext::CBuffersOrTBuffers; + if (isGroupShared) + ObjDiagContext = TypeDiagContext::GroupShared; + else if (isStatic) + ObjDiagContext = TypeDiagContext::GlobalVariables; + + TypeDiagContext LongVecDiagContext = TypeDiagContext::Valid; + + // Disallow long vecs from $Global cbuffers. + if (!isStatic && !isGroupShared && !IS_BASIC_OBJECT(basicKind)) + LongVecDiagContext = TypeDiagContext::CBuffersOrTBuffers; + if (DiagnoseTypeElements(*this, D.getLocStart(), qt, ObjDiagContext, + LongVecDiagContext)) result = false; - } } // SPIRV change starts @@ -16144,121 +16931,6 @@ QualType Sema::getHLSLDefaultSpecialization(TemplateDecl *Decl) { return QualType(); } -static bool isRelatedDeclMarkedNointerpolation(Expr *E) { - if (!E) - return false; - E = E->IgnoreCasts(); - if (auto *DRE = dyn_cast(E)) - return DRE->getDecl()->hasAttr(); - - if (auto *ME = dyn_cast(E)) - return ME->getMemberDecl()->hasAttr() || - isRelatedDeclMarkedNointerpolation(ME->getBase()); - - if (auto *HVE = dyn_cast(E)) - return isRelatedDeclMarkedNointerpolation(HVE->getBase()); - - if (auto *ASE = dyn_cast(E)) - return isRelatedDeclMarkedNointerpolation(ASE->getBase()); - - return false; -} - -// Verify that user-defined intrinsic struct args contain no long vectors -static bool CheckUDTIntrinsicArg(Sema *S, Expr *Arg) { - if (ContainsLongVector(Arg->getType())) { - const unsigned UserDefinedStructParameterIdx = 5; - S->Diag(Arg->getExprLoc(), diag::err_hlsl_unsupported_long_vector) - << UserDefinedStructParameterIdx; - return true; - } - return false; -} - -static bool CheckIntrinsicGetAttributeAtVertex(Sema *S, FunctionDecl *FDecl, - CallExpr *TheCall) { - assert(TheCall->getNumArgs() > 0); - auto argument = TheCall->getArg(0)->IgnoreCasts(); - - if (!isRelatedDeclMarkedNointerpolation(argument)) { - S->Diag(argument->getExprLoc(), diag::err_hlsl_parameter_requires_attribute) - << 0 << FDecl->getName() << "nointerpolation"; - return true; - } - - return false; -} - -bool Sema::CheckHLSLIntrinsicCall(FunctionDecl *FDecl, CallExpr *TheCall) { - auto attr = FDecl->getAttr(); - - if (!attr) - return false; - - if (!IsBuiltinTable(attr->getGroup())) - return false; - - switch (hlsl::IntrinsicOp(attr->getOpcode())) { - case hlsl::IntrinsicOp::IOP_GetAttributeAtVertex: - // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want - // to limit the scope, and fail gracefully in some cases. - if (!getLangOpts().SPIRV) - return false; - // This should never happen for SPIR-V. But on the DXIL side, extension can - // be added by inserting new intrinsics, meaning opcodes can collide with - // existing ones. See the ExtensionTest.EvalAttributeCollision test. 
- assert(FDecl->getName() == "GetAttributeAtVertex"); - return CheckIntrinsicGetAttributeAtVertex(this, FDecl, TheCall); - case hlsl::IntrinsicOp::IOP_DispatchMesh: - assert(TheCall->getNumArgs() > 3); - assert(FDecl->getName() == "DispatchMesh"); - return CheckUDTIntrinsicArg(this, TheCall->getArg(3)->IgnoreCasts()); - case hlsl::IntrinsicOp::IOP_CallShader: - assert(TheCall->getNumArgs() > 1); - assert(FDecl->getName() == "CallShader"); - return CheckUDTIntrinsicArg(this, TheCall->getArg(1)->IgnoreCasts()); - case hlsl::IntrinsicOp::IOP_TraceRay: - assert(TheCall->getNumArgs() > 7); - assert(FDecl->getName() == "TraceRay"); - return CheckUDTIntrinsicArg(this, TheCall->getArg(7)->IgnoreCasts()); - case hlsl::IntrinsicOp::IOP_ReportHit: - assert(TheCall->getNumArgs() > 2); - assert(FDecl->getName() == "ReportHit"); - return CheckUDTIntrinsicArg(this, TheCall->getArg(2)->IgnoreCasts()); - default: - break; - } - - return false; -} - -bool Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall) { - if (hlsl::IsIntrinsicOp(FDecl) && CheckHLSLIntrinsicCall(FDecl, TheCall)) - return true; - - // See #hlsl-specs/issues/181. Feature is broken. For SPIR-V we want - // to limit the scope, and fail gracefully in some cases. - if (!getLangOpts().SPIRV) - return false; - - bool error = false; - for (unsigned i = 0; i < FDecl->getNumParams(); i++) { - assert(i < TheCall->getNumArgs()); - - if (!FDecl->getParamDecl(i)->hasAttr()) - continue; - - if (!isRelatedDeclMarkedNointerpolation(TheCall->getArg(i))) { - Diag(TheCall->getArg(i)->getExprLoc(), - diag::err_hlsl_parameter_requires_attribute) - << i << FDecl->getName() << "nointerpolation"; - error = true; - } - } - - return error; -} - namespace hlsl { static bool nodeInputIsCompatible(DXIL::NodeIOKind IOType, @@ -16482,6 +17154,10 @@ void DiagnoseNodeEntry(Sema &S, FunctionDecl *FD, llvm::StringRef StageName, DXIL::ShaderKind shaderKind = ShaderModel::KindFromFullName(StageName); if (shaderKind == DXIL::ShaderKind::Node) { NodeLoc = pAttr->getLocation(); + // SPIR-V node shader support is experimental + if (S.getLangOpts().SPIRV) { + S.Diag(NodeLoc, diag::warn_spirv_node_shaders_experimental); + } } if (NodeLoc.isInvalid()) { return; @@ -16909,18 +17585,15 @@ void DiagnoseEntry(Sema &S, FunctionDecl *FD) { // Would be nice to check for resources here as they crash the compiler now. // See issue #7186. 
for (const auto *param : FD->params()) { - if (ContainsLongVector(param->getType())) { - const unsigned EntryFunctionParametersIdx = 6; - S.Diag(param->getLocation(), diag::err_hlsl_unsupported_long_vector) - << EntryFunctionParametersIdx; - } + const TypeDiagContext DiagContext = + TypeDiagContext::EntryFunctionParameters; + hlsl::DiagnoseTypeElements(S, param->getLocation(), param->getType(), + DiagContext, DiagContext); } - if (ContainsLongVector(FD->getReturnType())) { - const unsigned EntryFunctionReturnIdx = 7; - S.Diag(FD->getLocation(), diag::err_hlsl_unsupported_long_vector) - << EntryFunctionReturnIdx; - } + const TypeDiagContext DiagContext = TypeDiagContext::EntryFunctionReturnType; + DiagnoseTypeElements(S, FD->getLocation(), FD->getReturnType(), DiagContext, + DiagContext); DXIL::ShaderKind Stage = ShaderModel::KindFromFullName(shaderAttr->getStage()); diff --git a/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp b/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp index abca7cbf86..a3ca955802 100644 --- a/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp +++ b/tools/clang/lib/Sema/SemaHLSLDiagnoseTU.cpp @@ -709,20 +709,18 @@ void hlsl::DiagnoseTranslationUnit(clang::Sema *self) { << hullPatchCount.value(); } } - for (const auto *param : pPatchFnDecl->params()) - if (ContainsLongVector(param->getType())) { - const unsigned PatchConstantFunctionParametersIdx = 8; - self->Diag(param->getLocation(), - diag::err_hlsl_unsupported_long_vector) - << PatchConstantFunctionParametersIdx; - } - - if (ContainsLongVector(pPatchFnDecl->getReturnType())) { - const unsigned PatchConstantFunctionReturnIdx = 9; - self->Diag(pPatchFnDecl->getLocation(), - diag::err_hlsl_unsupported_long_vector) - << PatchConstantFunctionReturnIdx; + for (const auto *param : pPatchFnDecl->params()) { + const TypeDiagContext ParamDiagContext = + TypeDiagContext::PatchConstantFunctionParameters; + DiagnoseTypeElements(*self, param->getLocation(), param->getType(), + ParamDiagContext, ParamDiagContext); } + + const TypeDiagContext ReturnDiagContext = + TypeDiagContext::PatchConstantFunctionReturnType; + DiagnoseTypeElements(*self, pPatchFnDecl->getLocation(), + pPatchFnDecl->getReturnType(), ReturnDiagContext, + ReturnDiagContext); } DXIL::ShaderKind EntrySK = shaderModel->GetKind(); DXIL::NodeLaunchType NodeLaunchTy = DXIL::NodeLaunchType::Invalid; diff --git a/tools/clang/lib/Sema/SemaLookup.cpp b/tools/clang/lib/Sema/SemaLookup.cpp index 98832a8f57..eec8a7fa64 100644 --- a/tools/clang/lib/Sema/SemaLookup.cpp +++ b/tools/clang/lib/Sema/SemaLookup.cpp @@ -55,6 +55,7 @@ using namespace clang; using namespace sema; +// HLSL Note: This set of utilities copied to SemaHLSL.cpp. namespace { class UnqualUsingEntry { const DeclContext *Nominated; @@ -4809,9 +4810,12 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction, NamedDecl *ChosenDecl = Correction.isKeyword() ? nullptr : Correction.getCorrectionDecl(); - if (PrevNote.getDiagID() && ChosenDecl) + // HLSL Change begin: don't put notes on invalid source locations. + if (PrevNote.getDiagID() && ChosenDecl && + !ChosenDecl->getLocation().isInvalid()) Diag(ChosenDecl->getLocation(), PrevNote) << CorrectedQuotedStr << (ErrorRecovery ? 
FixItHint() : FixTypo); + // HLSL Change end } TypoExpr *Sema::createDelayedTypo(std::unique_ptr TCC, @@ -4836,3 +4840,33 @@ const Sema::TypoExprState &Sema::getTypoExprState(TypoExpr *TE) const { void Sema::clearDelayedTypo(TypoExpr *TE) { DelayedTypos.erase(TE); } + +// HLSL Change Begin +void Sema::CollectNamespaceContexts(Scope *S, + SmallVectorImpl &NSs) { + UnqualUsingDirectiveSet UDirs; + + // Add using directives from this context up to the top level. This + // handles cases where the current declaration is in a context that has + // a using directive but might be in a scope chain that doesn't reach + // the using directive (i.e. a using inside a namespace or class + // declaration but the function definition is outside). + DeclContext *Ctx = S->getEntity(); + for (DeclContext *UCtx = Ctx; UCtx; UCtx = UCtx->getParent()) { + if (UCtx->isTransparentContext()) + continue; + + UDirs.visit(UCtx, UCtx); + } + // Find the first namespace or translation-unit scope. + Scope *Innermost = S; + while (Innermost && !isNamespaceOrTranslationUnitScope(Innermost)) + Innermost = Innermost->getParent(); + + UDirs.visitScopeChain(S, Innermost); + UDirs.done(); + + for (auto &UD : UDirs) + NSs.push_back(UD.getNominatedNamespace()); +} +// HLSL Change End diff --git a/tools/clang/lib/Sema/SemaOverload.cpp b/tools/clang/lib/Sema/SemaOverload.cpp index 636eaf0213..274b66646b 100644 --- a/tools/clang/lib/Sema/SemaOverload.cpp +++ b/tools/clang/lib/Sema/SemaOverload.cpp @@ -146,8 +146,8 @@ ImplicitConversionRank clang::GetConversionRank(ImplicitConversionKind Kind) { }; static_assert(_countof(Rank) == ICK_Num_Conversion_Kinds, "Otherwise, GetConversionRank is out of sync with ImplicitConversionKind"); // HLSL Change - assert((int)Kind < (int)ICK_Num_Conversion_Kinds); // HLSL Change - return Rank[(int)Kind]; + assert(Kind < _countof(Rank)); // HLSL Change + return Rank[Kind]; // HLSL Change } /// GetImplicitConversionName - Return the name of this kind of @@ -10627,6 +10627,7 @@ static void AddOverloadedCallCandidate(Sema &S, void Sema::AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE, ArrayRef Args, OverloadCandidateSet &CandidateSet, + Scope *S, // HLSL Change bool PartialOverloading) { #ifndef NDEBUG @@ -10659,8 +10660,8 @@ void Sema::AddOverloadedCallCandidates(UnresolvedLookupExpr *ULE, #endif // HLSL Change - allow ExternalSource the ability to add the overloads for a call. - if (ExternalSource && - ExternalSource->AddOverloadedCallCandidates(ULE, Args, CandidateSet, PartialOverloading)) { + if (ExternalSource && ExternalSource->AddOverloadedCallCandidates( + ULE, Args, CandidateSet, S, PartialOverloading)) { return; } @@ -10970,7 +10971,7 @@ bool Sema::buildOverloadedCallSet(Scope *S, Expr *Fn, // Add the functions denoted by the callee to the set of candidate // functions, including those from argument-dependent lookup. 
- AddOverloadedCallCandidates(ULE, Args, *CandidateSet); + AddOverloadedCallCandidates(ULE, Args, *CandidateSet, S); // HLSL Change if (getLangOpts().MSVCCompat && CurContext->isDependentContext() && !isSFINAEContext() && diff --git a/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp b/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp index 1eacedbb0b..a6ae05faa5 100644 --- a/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/tools/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -2139,18 +2139,6 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation, SourceLocation(), SourceLocation(), nullptr); CheckCompletedCXXClass(Instantiation); - // HLSL Change Begin - set longvec bit for vectors of over 4 elements - ClassTemplateSpecializationDecl *Spec = - dyn_cast<ClassTemplateSpecializationDecl>(Instantiation); - if (Spec && Spec->hasAttr()) { - const TemplateArgumentList &argList = Spec->getTemplateArgs(); - const TemplateArgument &arg1 = argList[1]; - llvm::APSInt vecSize = arg1.getAsIntegral(); - if (vecSize.getLimitedValue() > hlsl::DXIL::kDefaultMaxVectorLength) - Instantiation->setHasHLSLLongVector(); - } - // HLSL Change End - set longvec bit for vectors of over 4 elements - // Default arguments are parsed, if not instantiated. We can go instantiate // default arg exprs for default constructors if necessary now. ActOnFinishCXXMemberDefaultArgs(Instantiation); diff --git a/tools/clang/lib/Sema/SemaType.cpp b/tools/clang/lib/Sema/SemaType.cpp index ff3b0dbac7..f08ae486b5 100644 --- a/tools/clang/lib/Sema/SemaType.cpp +++ b/tools/clang/lib/Sema/SemaType.cpp @@ -462,7 +462,7 @@ distributeObjCPointerTypeAttrFromDeclarator(TypeProcessingState &state, // objc_gc goes on the innermost pointer to something that's not a // pointer. - unsigned innermost = -1U; + unsigned innermost = std::numeric_limits<unsigned>::max(); bool considerDeclSpec = true; for (unsigned i = 0, e = declarator.getNumTypeObjects(); i != e; ++i) { DeclaratorChunk &chunk = declarator.getTypeObject(i); @@ -501,7 +501,7 @@ distributeObjCPointerTypeAttrFromDeclarator(TypeProcessingState &state, // Otherwise, if we found an appropriate chunk, splice the attribute // into it. 
- if (innermost != -1U) { + if (innermost != std::numeric_limits::max()) { moveAttrFromListToList(attr, declarator.getAttrListRef(), declarator.getTypeObject(innermost).getAttrListRef()); return; diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl index 9f7a487a05..572734d679 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-agg-load-stores.hlsl @@ -1,35 +1,35 @@ -// RUN: %dxc -T vs_6_6 -DETY=float -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=bool -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=uint64_t -DCOLS=2 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=double -DCOLS=2 %s | FileCheck %s +// RUN: %dxc -T vs_6_6 -DETY=float -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=bool -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=uint64_t -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=double -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI -// RUN: %dxc -T vs_6_6 -DETY=float1 -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=bool1 -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=uint64_t1 -DCOLS=2 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=double1 -DCOLS=2 %s | FileCheck %s +// RUN: %dxc -T vs_6_6 -DETY=float1 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=bool1 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=uint64_t1 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=double1 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI -// RUN: %dxc -T vs_6_6 -DETY=float4 -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=bool4 -DCOLS=4 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=uint64_t4 -DCOLS=2 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DETY=double4 -DCOLS=2 %s | FileCheck %s +// RUN: %dxc -T vs_6_6 -DETY=float4 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=bool4 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=uint64_t4 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DETY=double4 -DCOLS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float -DCOLS=2 -DROWS=2 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool -DCOLS=2 -DROWS=2 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=2 -DROWS=2 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double -DCOLS=2 -DROWS=2 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT +// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=float -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=bool -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | 
FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=matrix -DETY=double -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float -DCOLS=2 -DROWS=2 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=2 -DROWS=2 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double -DCOLS=2 -DROWS=2 %s | FileCheck %s -// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=bool -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT -// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MAT +// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=float -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=bool -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=uint64_t -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI +// RUN: %dxc -T vs_6_6 -DATY=Matrix -DETY=double -DCOLS=3 -DROWS=3 %s | FileCheck %s --check-prefixes=CHECK,MULTI // RUN: %dxc -T vs_6_6 -DATY=Vector -DETY=float -DCOLS=4 %s | FileCheck %s // RUN: %dxc -T vs_6_6 -DATY=Vector -DETY=bool -DCOLS=4 %s | FileCheck %s @@ -105,27 +105,27 @@ RWStructuredBuffer< TYPE SS > RwStBuf : register(u2); ConsumeStructuredBuffer< TYPE SS > CnStBuf : register(u4); AppendStructuredBuffer< TYPE SS > ApStBuf : register(u5); -TYPE Add(TYPE f1[COLS], TYPE f2[COLS])[COLS] { +TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS])[COLS] { TYPE ret[COLS]; for (int i = 0; i < COLS; i++) - ret[i] = f1[i] + f2[i]; + ret[i] = f1[i] + f2[i] + f3[i] + f4[i]; return ret; } template -T Add(T v1, T v2) { return v1 + v2; } +T Add(T v1, T v2, T v3, T v4) { return v1 + v2 + v3 + v4; } -TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS])[COLS] { +TYPE Add(TYPE f1[COLS], TYPE f2[COLS], TYPE f3[COLS], TYPE f4[COLS], TYPE f5[COLS], TYPE f6[COLS])[COLS] { TYPE ret[COLS]; for (int i = 0; i < COLS; i++) - ret[i] = f1[i] + f2[i] + f3[i] + f4[i]; + ret[i] = f1[i] + f2[i] + f3[i] + f4[i] + f5[i] + f6[i]; return ret; } template -T Add(T v1, T v2, T v3, T v4) { return v1 + v2 + v3 + v4; } +T Add(T v1, T v2, T v3, T v4, T v5, T v6) { return v1 + v2 + v3 + v4 + v5 + v6; } -void main(uint ix[2] : IX) { +void main(uint ix[3] : IX) { // ByteAddressBuffer Tests // CHECK-DAG: [[HDLROBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false) @@ -150,26 +150,55 @@ void main(uint ix[2] : IX) { // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] // OFF: [[RIX0:%.*]] = add i32 [[IX0]], [[BOFF:[0-9]+]] // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX0]] - // MAT: [[IX0p4:%.*]] = add i32 [[RIX0]], [[p4:[0-9]+]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]] - // MAT: [[IX0p8:%.*]] = add i32 [[RIX0]], [[p8:[0-9]+]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]] + // MULTI: [[IX0p4:%.*]] = add i32 [[RIX0]], [[p4:[0-9]+]] + // MULTI: call 
%dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]] + // MULTI: [[IX0p8:%.*]] = add i32 [[RIX0]], [[p8:[0-9]+]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]] // I1: icmp ne i32 // I1: icmp ne i32 // I1: icmp ne i32 // I1: icmp ne i32 TYPE babElt1 SS = RwByBuf.Load< TYPE SS >(ix[0]); + // CHECK-DAG: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 1 + // CHECK-DAG: [[RIX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 1 + // OFF: [[RIX1:%.*]] = add i32 [[IX1]], [[BOFF]] + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX1]] + // MULTI: [[IX1p4:%.*]] = add i32 [[RIX1]], [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1p4]] + // MULTI: [[IX1p8:%.*]] = add i32 [[RIX1]], [[p8]] + // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1p8]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 + // I1: icmp ne i32 + // I1: icmp ne i32 + // I1: icmp ne i32 + uint status1; + TYPE babElt3 SS = RwByBuf.Load< TYPE SS >(ix[1], status1); + // CHECK: [[ANHDLROBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROBY]] // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[RIX0]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE babElt2 SS = RoByBuf.Load< TYPE SS >(ix[0]); + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[RIX1]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1p4]] + // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1p8]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + uint status2; + TYPE babElt4 SS = RoByBuf.Load< TYPE SS >(ix[1], status2); + // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 @@ -177,48 +206,76 @@ void main(uint ix[2] : IX) { // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 {{%.*}}, i32 undef, float 0.0 // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 {{%.*}}, i32 undef, double 0.0 
// CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[RIX0]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]] - RwByBuf.Store< TYPE SS >(ix[0], Add(babElt1, babElt2)); + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p4]] + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0p8]] + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100 + RwByBuf.Store< TYPE SS >(ix[0], Add(babElt1, babElt2, babElt3, babElt4)); + RwByBuf.Store< uint > (100, status1 && status2); // StructuredBuffer Tests // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]] // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[BOFF]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt1 SS = RwStBuf.Load(ix[0]); - // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, + // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[BOFF]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]], i32 [[p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt2 SS = RwStBuf[ix[1]]; + // CHECK: [[IX2:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 {{[0-9]*}}, i32 2 + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[BOFF]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[p4]] + // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]], i32 [[p8]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 
%{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE stbElt5 SS = RwStBuf.Load(ix[2], status1); + // CHECK: [[ANHDLROST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROST]] // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[BOFF]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]], i32 [[p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt3 SS = RoStBuf.Load(ix[0]); + // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[BOFF]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]], i32 [[p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt4 SS = RoStBuf[ix[1]]; + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[BOFF]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[p4]] + // MULTI: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]], i32 [[p8]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE stbElt6 SS = RoStBuf.Load(ix[2], status2); + // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 @@ -226,9 +283,13 @@ void main(uint ix[2] : IX) { // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 0, float 0.0 // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 16, double 0.0 // CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[BOFF]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p4]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]] - RwStBuf[ix[0]] = Add(stbElt1, stbElt2, stbElt3, stbElt4); + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 
[[IX0]], i32 [[p4]] + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]], i32 [[p8]] + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200 + RwStBuf[ix[0]] = Add(stbElt1, stbElt2, stbElt3, stbElt4, stbElt5, stbElt6); + RwByBuf.Store< uint > (200, status1 && status2); // {Append/Consume}StructuredBuffer Tests // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]] @@ -236,8 +297,8 @@ void main(uint ix[2] : IX) { // OFF: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 // OFF: call %dx.types.ResRet.f64 @dx.op.rawBufferLoad.f64(i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 16 // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[BOFF]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p4]] - // MAT: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p8]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p4]] + // MULTI: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]], i32 [[p8]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 @@ -253,7 +314,7 @@ void main(uint ix[2] : IX) { // OFF: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 0 // OFF: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 16 // CHECK: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[BOFF]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p4]] - // MAT: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p8]] + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p4]] + // MULTI: call void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]], i32 [[p8]] ApStBuf.Append(cnElt); } diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl index 5305ee495b..f71b29e83e 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores-sm69.hlsl @@ -26,7 +26,7 @@ AppendStructuredBuffer > ApStBuf : register(u5); // CHECK-LABEL: define void @main [shader("vertex")] -void main(uint ix[2] : IX) { +void main(uint ix[3] : IX) { // ByteAddressBuffer Tests // CHECK-DAG: [[HDLROBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false) @@ -45,36 +45,73 @@ void main(uint ix[2] : IX) { // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector babElt1 = RwByBuf.Load< vector >(ix[0]); + // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, + // CHECK: [[RESRET:%.*]] = 
call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer + uint status1; + vector babElt3 = RwByBuf.Load< vector >(ix[1], status1); + // CHECK: [[ANHDLROBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROBY]] // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0]] // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector babElt2 = RoByBuf.Load< vector >(ix[0]); + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer + uint status2; + vector babElt4 = RoByBuf.Load< vector >(ix[1], status2); + // I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32> // CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]] - RwByBuf.Store< vector >(ix[0], babElt1 + babElt2); + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100 + RwByBuf.Store< vector >(ix[0], babElt1 + babElt2 + babElt3 + babElt4); + RwByBuf.Store< uint > (100, status1 && status2); // StructuredBuffer Tests // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]] // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]] // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector stbElt1 = RwStBuf.Load(ix[0]); - // CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, + // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]] // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector stbElt2 = RwStBuf[ix[1]]; + // CHECK: [[IX2:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX2]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer + vector stbElt5 = RwStBuf.Load(ix[2], status1); + // CHECK: [[ANHDLROST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROST]] // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]] // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector stbElt3 = RoStBuf.Load(ix[0]); + // CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]] // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer vector stbElt4 = RoStBuf[ix[1]]; + // CHECK: [[RESRET:%.*]] = call 
%dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX2]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[VTY]] [[RESRET]], 1 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer + vector stbElt6 = RoStBuf.Load(ix[2], status2); + // I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32> // CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]] - RwStBuf[ix[0]] = stbElt1 + stbElt2 + stbElt3 + stbElt4; + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200 + RwStBuf[ix[0]] = stbElt1 + stbElt2 + stbElt3 + stbElt4 + stbElt5 + stbElt6; + RwByBuf.Store< uint > (200, status1 && status2); // {Append/Consume}StructuredBuffer Tests // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]] diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl index 8dcf5ead1c..896f442c2c 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-load-stores.hlsl @@ -87,12 +87,36 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 TYPE babElt2 = RoByBuf.Load< TYPE >(ix0); + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX1]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + uint status1 = 0; + TYPE babElt3 = RwByBuf.Load< TYPE >(ix1, status1); + + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROBY]], i32 [[IX1]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + uint status2 = 0; + TYPE babElt4 = RoByBuf.Load< TYPE >(ix1, status2); + // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // CHECK: all void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]] - RwByBuf.Store< TYPE >(ix0, babElt1 + babElt2); + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 100 + RwByBuf.Store< TYPE >(ix0, babElt1 + babElt2 + babElt3 + babElt4); + RwByBuf.Store< uint > (100, status1 && status2); // StructuredBuffer Tests // CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]] @@ -102,6 +126,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : 
IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt1 = RwStBuf.Load(ix0); + // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 @@ -116,6 +141,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt3 = RoStBuf.Load(ix0); + // CHECK: call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]] // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 @@ -123,12 +149,34 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 TYPE stbElt4 = RoStBuf[ix1]; + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLRWST]], i32 [[IX20]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE stbElt5 = RwStBuf.Load(ix2[0], status1); + + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.rawBufferLoad.[[TY]](i32 139, %dx.types.Handle [[ANHDLROST]], i32 [[IX20]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[RESRET]], 4 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE stbElt6 = RoStBuf.Load(ix2[0], status2); + // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // CHECK: all void @dx.op.rawBufferStore.[[TY]](i32 140, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]] - RwStBuf[ix0] = stbElt1 + stbElt2 + stbElt3 + stbElt4; + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 200 + RwStBuf[ix0] = stbElt1 + stbElt2 + stbElt3 + stbElt4 + stbElt5 + stbElt6; + RwByBuf.Store< uint > (200, status1 && status2); // {Append/Consume}StructuredBuffer Tests // CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]] @@ -167,6 +215,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE typElt1 = RwTyBuf.Load(ix0); + // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX1]] // F64: call double @dx.op.makeDouble.f64(i32 101 // F64: call double @dx.op.makeDouble.f64(i32 101 @@ -183,6 +232,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE typElt2 = RwTyBuf[ix1]; + // CHECK: [[ANHDLROTY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROTY]] // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX0]] // F64: call double @dx.op.makeDouble.f64(i32 101 @@ -200,6 +250,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, 
uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE typElt3 = RoTyBuf.Load(ix0); + // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX1]] // F64: call double @dx.op.makeDouble.f64(i32 101 // F64: call double @dx.op.makeDouble.f64(i32 101 @@ -217,6 +268,44 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 TYPE typElt4 = RoTyBuf[ix1]; + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX20]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY32]] [[RESRET]], 4 + // CHECK: [[CHK1:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // F64: call double @dx.op.makeDouble.f64(i32 101 + // F64: call double @dx.op.makeDouble.f64(i32 101 + // I64: zext i32 %{{.*}} to i64 + // I64: zext i32 %{{.*}} to i64 + // I64: shl nuw i64 + // I64: or i64 + // I64: zext i32 %{{.*}} to i64 + // I64: zext i32 %{{.*}} to i64 + // I64: shl nuw i64 + // I64: or i64 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE typElt5 = RwTyBuf.Load(ix2[0], status1); + + // CHECK: [[RESRET:%.*]] = call %dx.types.ResRet.[[TY32]] @dx.op.bufferLoad.[[TY32]](i32 68, %dx.types.Handle [[ANHDLROTY]], i32 [[IX20]] + // CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.[[TY32]] [[RESRET]], 4 + // CHECK: [[CHK2:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]]) + // F64: call double @dx.op.makeDouble.f64(i32 101 + // F64: call double @dx.op.makeDouble.f64(i32 101 + // I64: zext i32 %{{.*}} to i64 + // I64: zext i32 %{{.*}} to i64 + // I64: shl nuw i64 + // I64: or i64 + // I64: zext i32 %{{.*}} to i64 + // I64: zext i32 %{{.*}} to i64 + // I64: shl nuw i64 + // I64: or i64 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + // I1: icmp ne i32 %{{.*}}, 0 + TYPE typElt6 = RoTyBuf.Load(ix2[0], status2); + // F64: call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102 // F64: call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102 // I64: trunc i64 %{{.*}} to i32 @@ -229,8 +318,12 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 // I1: zext i1 %{{.*}} to i32 - // CHECK: all void @dx.op.bufferStore.[[TY32]](i32 69, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX0]] - RwTyBuf[ix0] = typElt1 + typElt2 + typElt3 + typElt4; + // CHECK: call void @dx.op.bufferStore.[[TY32]](i32 69, %dx.types.Handle [[ANHDLRWTY]], i32 [[IX0]] + // CHECK: and i1 [[CHK1]], [[CHK2]] + // CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]] + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[ANHDLRWBY]], i32 300 + RwTyBuf[ix0] = typElt1 + typElt2 + typElt3 + typElt4 + typElt5 + typElt6; + RwByBuf.Store< uint > (300, status1 && status2); // Texture Tests // CHECK: [[ANHDLROTX1:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROTX1]] @@ -250,6 +343,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE texElt1 = RoTex1d[ix0]; + // CHECK: [[ANHDLRWTX1:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX1]] 
// CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX1]], i32 undef, i32 [[IX0]], i32 undef, i32 undef // F64: call double @dx.op.makeDouble.f64(i32 101 @@ -285,6 +379,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE texElt3 = RoTex2d[ix2]; + // CHECK: [[ANHDLRWTX2:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX2]] // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX2]], i32 undef, i32 [[IX20]], i32 [[IX21]], i32 undef // F64: call double @dx.op.makeDouble.f64(i32 101 @@ -320,6 +415,7 @@ void main(uint ix0 : IX0, uint ix1 : IX1, uint2 ix2 : IX2, uint3 ix3 : IX3) { // I1: icmp ne i32 %{{.*}}, 0 // I1: icmp ne i32 %{{.*}}, 0 TYPE texElt5 = RoTex3d[ix3]; + // CHECK: [[ANHDLRWTX3:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWTX3]] // CHECK: call %dx.types.ResRet.[[TY32]] @dx.op.textureLoad.[[TY32]](i32 66, %dx.types.Handle [[ANHDLRWTX3]], i32 undef, i32 [[IX30]], i32 [[IX31]], i32 [[IX32]] // F64: call double @dx.op.makeDouble.f64(i32 101 diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl new file mode 100644 index 0000000000..75e7c8a5cd --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/check-shader-stages.hlsl @@ -0,0 +1,135 @@ +// RUN: %dxc -T lib_6_9 %s | FileCheck %s + +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; +ByteAddressBuffer input_vector_buffer; +RWByteAddressBuffer output_vector_buffer; + +void UseCoopVec() { + vector output_vector; + static const uint is_output_unsigned = 0; + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = 9; /*F32*/ + + const uint matrix_offset = 0; + const uint matrix_interpretation = 9; /*F32*/ + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = 0; /*RowMajor*/ + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + output_vector_buffer.Store(0, output_vector); + + const uint bias_offset = 0; + const uint bias_interpretation = 9; /*F32*/ + + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, + bias_interpretation); + output_vector_buffer.Store(1024, output_vector); + + vector input_vector1; + vector input_vector2; + const uint opa_matrix_offset = 0; + const uint opa_matrix_interpretation = 5; /*U32*/ + const uint opa_matrix_layout = 3; /*OuterProductOptimal*/ + const uint opa_matrix_stride = 0; + + __builtin_OuterProductAccumulate(input_vector1, input_vector2, + rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation, + opa_matrix_layout, opa_matrix_stride); + + const uint va_matrix_offset = 0; + + __builtin_VectorAccumulate(input_vector1, 
rw_matrix_buffer, + va_matrix_offset); +} + +// CHECK: define void @ps_main() +// CHECK: call <4 x float> @dx.op.matVecMul +// CHECK: call <4 x float> @dx.op.matVecMulAdd +// CHECK: call void @dx.op.outerProductAccumulate +// CHECK: call void @dx.op.vectorAccumulate + +[Shader("pixel")] +void ps_main() +{ + UseCoopVec(); +} + +// CHECK: define void @cs_main() +// CHECK: call <4 x float> @dx.op.matVecMul +// CHECK: call <4 x float> @dx.op.matVecMulAdd +// CHECK: call void @dx.op.outerProductAccumulate +// CHECK: call void @dx.op.vectorAccumulate + +[Shader("compute")] +[NumThreads(1,1,1)] +void cs_main() +{ + UseCoopVec(); +} + +// CHECK: define void @vs_main() +// CHECK: call <4 x float> @dx.op.matVecMul +// CHECK: call <4 x float> @dx.op.matVecMulAdd +// CHECK: call void @dx.op.outerProductAccumulate +// CHECK: call void @dx.op.vectorAccumulate + +[Shader("vertex")] +void vs_main() +{ + UseCoopVec(); +} + +struct MyRecord{ + uint a; +}; + +// CHECK: define void @ns_main() +// CHECK: call <4 x float> @dx.op.matVecMul +// CHECK: call <4 x float> @dx.op.matVecMulAdd +// CHECK: call void @dx.op.outerProductAccumulate +// CHECK: call void @dx.op.vectorAccumulate + +[Shader("node")] +[NodeLaunch("thread")] +void ns_main(ThreadNodeInputRecord input) +{ + UseCoopVec(); +} + +// Vertex shader output structure +struct VS_OUT { + float3 Color : COLOR0; +}; + +// Geometry shader output structure +struct GS_OUT { + float3 Color : COLOR0; + float2 TexCoord : TEXCOORD0; +}; + +// CHECK: define void @gs_main() +// CHECK: call <4 x float> @dx.op.matVecMul +// CHECK: call <4 x float> @dx.op.matVecMulAdd +// CHECK: call void @dx.op.outerProductAccumulate +// CHECK: call void @dx.op.vectorAccumulate + +[shader("geometry")] +[maxvertexcount(3)] +void gs_main(point VS_OUT input[1], + inout TriangleStream OutputStream) +{ + UseCoopVec(); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl new file mode 100644 index 0000000000..f1badb9101 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/linalg-builtins.hlsl @@ -0,0 +1,79 @@ +// RUN: %dxc -fcgl -T cs_6_9 -E cs_main %s | FileCheck %s + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer opa_input_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; +RWByteAddressBuffer output_vector_buffer; + +[Shader("compute")] +[NumThreads(1,1,1)] +void cs_main() +{ + vector output_vector; + static const uint is_output_unsigned = 0; + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = 9; /*F32*/ + + const uint matrix_offset = 0; + const uint matrix_interpretation = 9; /*F32*/ + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = 0; /*RowMajor*/ + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + // CHECK: %[[MLD0:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A" + // CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD0]]) + // CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH0]], 
%dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef) + // CHECK: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64) + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + output_vector_buffer.Store(0, output_vector); + + const uint bias_offset = 0; + const uint bias_interpretation = 9; /*F32*/ + + // CHECK: %[[MLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A" + // CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD1]]) + // CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef) + // CHECK-NEXT: %[[BLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A" + // CHECK-NEXT: %[[BCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[BLD1]]) + // CHECK-NEXT: %[[BAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[BCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef) + // CHECK-NEXT: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9) + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, + bias_interpretation); + output_vector_buffer.Store(1024, output_vector); + + vector input_vector1 = opa_input_buffer.Load >(0); + vector input_vector2 = opa_input_buffer.Load >(128); + const uint opa_matrix_offset = 0; + const uint opa_matrix_interpretation = 5; /*U32*/ + const uint opa_matrix_layout = 3; /*OuterProductOptimal*/ + const uint opa_matrix_stride = 0; + + // CHECK: %[[MLD2:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" + // CHECK: %[[MCH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD2]]) + // CHECK: %[[MAH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH2]], %dx.types.ResourceProperties { 
i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef) + // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH2]], i32 0, i32 5, i32 3, i32 0) + __builtin_OuterProductAccumulate(input_vector1, input_vector2, + rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation, + opa_matrix_layout, opa_matrix_stride); + + const uint va_matrix_offset = 0; + + // CHECK: %[[MLD3:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" + // CHECK: %[[MCH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD3]]) + // CHECK: %[[MAH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH3]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef) + // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH3]], i32 0) + __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer, + va_matrix_offset); +} \ No newline at end of file diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg new file mode 100644 index 0000000000..c2417a9e43 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/lit.local.cfg @@ -0,0 +1 @@ +config.unsupported = 'dxil-1-9' not in config.available_features \ No newline at end of file diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl new file mode 100644 index 0000000000..de811982d6 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul-add_multioverload.hlsl @@ -0,0 +1,122 @@ +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -DMST=64 | FileCheck 
%s --check-prefixes COMMON,DXIL-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-7 + +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7 + + +// COMMON: define void @main() + +// Test minimum support set of combinations for matVecMul +// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) + +// DXIL-0: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) + +// DXIL-1: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; 
MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-2: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) + +// DXIL-2: call <8 x half> @dx.op.matVecMulAdd.v8f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-3: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <2 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <2 x i32> %{{[^ ]+}}, i1 true, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4) + +// DXIL-3: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v2i32(i32 306, <2 x i32> {{[^ ]+}}, i1 true, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-4: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4) + +// DXIL-4: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// Test unsigned variations +// HLOP-5: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20) + +// DXIL-5: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 true) ; 
MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-6: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20) + +// DXIL-6: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// HLOP-7: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20) + +// DXIL-7: call <8 x i32> @dx.op.matVecMulAdd.v8i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; +RWByteAddressBuffer output_vector_buffer; + +enum CompType { + Invalid = 0, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, +}; + +enum MatLayout { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + +[NumThreads(1,1,1)] +void main() +{ + vector output_vector; + static const uint is_output_unsigned = OU; + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = IU; + const uint input_interpretation = II; + + const uint matrix_offset = 0; + const uint matrix_interpretation = MI; + const uint matrix_dimM = 8; + const uint matrix_dimK = 8; + const uint matrix_layout = ML; + const bool matrix_is_transposed = (bool) MT; + const uint matrix_stride = MST; + + const uint bias_offset = 0; + const uint bias_interpretation = BI; + + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, + matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, bias_interpretation); + output_vector_buffer.Store(0, output_vector); +} diff --git 
a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl new file mode 100644 index 0000000000..8b14fb4cf1 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/mat-vec-mul_multioverload.hlsl @@ -0,0 +1,118 @@ +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DMST=0| FileCheck %s --check-prefixes COMMON,DXIL-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=PackedS8x32 -DINUM=2 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DINUM=8 -DML=RowMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DINUM=8 -DML=ColumnMajor -DMT=0 -DMST=64 | FileCheck %s --check-prefixes COMMON,DXIL-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DINUM=8 -DML=MulOptimal -DMT=1 -DMST=0 | FileCheck %s --check-prefixes COMMON,DXIL-7 + +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DINUM=8 -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=2 -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DINUM=8 -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DINUM=8 -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DMST=64 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DINUM=8 -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DMST=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7 + +// COMMON: 
define void @main() + +// Test minimum support set of combinations for matVecMul +// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64) + +// DXIL-0: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0) + +// DXIL-1: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 0, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-2: call void @"dx.hl.op..void (i32, <8 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0) + +// DXIL-2: call <8 x half> @dx.op.matVecMul.v8f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 0, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-3: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <2 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <2 x i32> %{{[^ ]+}}, i1 true, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0) + +// DXIL-3: call <8 x i32> @dx.op.matVecMul.v8i32.v2i32(i32 305, <2 x i32> {{[^ ]+}}, i1 true, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 0, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-4: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64) + +// DXIL-4: call <8 x i32> @dx.op.matVecMul.v8i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// Test unsigned variations +// HLOP-5: 
call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64) + +// DXIL-5: call <8 x i32> @dx.op.matVecMul.v8i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 true) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-6: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64) + +// DXIL-6: call <8 x i32> @dx.op.matVecMul.v8i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// HLOP-7: call void @"dx.hl.op..void (i32, <8 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <8 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0) + +// DXIL-7: call <8 x i32> @dx.op.matVecMul.v8i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 0, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; +RWByteAddressBuffer output_vector_buffer; + +enum CompType { + Invalid = 0, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, +}; + +enum MatLayout { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + +[NumThreads(1,1,1)] +void main() +{ + vector output_vector; + static const uint is_output_unsigned = OU; + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = IU; + const uint input_interpretation = II; + + const uint matrix_offset = 0; + const uint matrix_interpretation = MI; + const uint matrix_dimM = 8; + const uint matrix_dimK = 8; + const uint matrix_layout = ML; + const bool matrix_is_transposed = (bool) MT; + const uint matrix_stride = MST; + + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, + matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride); + output_vector_buffer.Store(0, output_vector); +} diff --git 
a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl new file mode 100644 index 0000000000..c53b7d8f21 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl @@ -0,0 +1,75 @@ +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer input_vector_buffer2; +RWByteAddressBuffer matrix_buffer; + +// COMMON: define void @main() + +// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + +// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0) + +// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + +// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 0) + +// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + +// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 0) + +enum CompType { + Invalid = 0, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, +}; + +enum MatLayout { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + + +[Numthreads(1,1,1)] +void main() +{ + vector input_vector1 = 
input_vector_buffer.Load >(0); + vector input_vector2 = input_vector_buffer2.Load >(0); + + const uint matrix_interpretation = MI; + const uint matrix_layout = ML; + const uint matrix_offset = 0; + const uint matrix_stride = 0; + + __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride); + +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl new file mode 100644 index 0000000000..dc1bb6c563 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -T cs_6_9 %s | FileCheck %s + +RWByteAddressBuffer matrix_buffer; + +// Test use of __builtin_VectorAccumulate in compute shader +// CHECK: define void @main() +// CHECK: call void @dx.op.vectorAccumulate.v2i32(i32 {{[0-9]+}}, <2 x i32> , %dx.types.Handle {{%[0-9]+}}, i32 0) + +[NumThreads(1,1,1)] +void main() +{ + vector input_vector1 = 5; + const uint matrix_offset = 0; + + __builtin_VectorAccumulate(input_vector1, matrix_buffer, matrix_offset); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl new file mode 100644 index 0000000000..42dff9c52c --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/maybereorder_od.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -T lib_6_9 -E main %s -Od | FileCheck %s --check-prefix DXIL + +// DXIL: %[[HOA:[^ ]+]] = alloca %dx.types.HitObject, align 4 +// DXIL-NEXT: %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() +// DXIL-NEXT: store %dx.types.HitObject %[[NOP]], %dx.types.HitObject* %[[HOA]] +// DXIL-NEXT: %[[LD0:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOA]] +// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[LD0]], i32 undef, i32 0) ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB) +// DXIL-NEXT: %[[LD1:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %[[HOA]] +// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[LD1]], i32 241, i32 3) ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB) +// DXIL-NEXT: %[[NOP2:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() +// DXIL-NEXT: call void @dx.op.maybeReorderThread(i32 268, %dx.types.HitObject %[[NOP2]], i32 242, i32 7) ; MaybeReorderThread(hitObject,coherenceHint,numCoherenceHintBitsFromLSB) + +[shader("raygeneration")] +void main() { + dx::HitObject hit; + dx::MaybeReorderThread(hit); + dx::MaybeReorderThread(hit, 0xf1, 3); + dx::MaybeReorderThread(0xf2, 7); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl new file mode 100644 index 0000000000..26bcc75da2 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-mul.hlsl @@ -0,0 +1,92 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s | FileCheck %s + +#include + +ByteAddressBuffer Buf; + +export float4 Test1(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMul.v4f32.v4f32(i32 305, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle %{{.+}}, i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, i1 false) 
+ return Mul( + Matrix, MakeInterpretedVector(Input)); +} + +export vector Test2(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // note the stride argument is dropped. + // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 2, i1 false, i32 0, i1 false) + return Mul(Matrix, + MakeInterpretedVector(Input)); +} + +// test that "stride" isn't ignored in non-optimal layouts +export vector Test3(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 6 * 4 * 8}; + + // CHECK: %{{.+}} = call <8 x float> @dx.op.matVecMul.v8f32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 false) + return Mul(Matrix, + MakeInterpretedVector(Input)); +} + +// test that isUnsigned is set correctly for uint16_t +export vector Test4(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 6 * 4 * 8}; + + // CHECK: %{{.+}} = call <8 x i16> @dx.op.matVecMul.v8i16.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true) + return Mul(Matrix, + MakeInterpretedVector(Input)); + +} + +// test that isUnsigned is set correctly for uint32_t +export vector Test5(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 6 * 4 * 8}; + + // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true) + return Mul(Matrix, + MakeInterpretedVector(Input)); + +} + +// test that isUnsigned is set correctly for uint8_t4_packed +export vector Test5(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 6 * 4 * 8}; + + // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 18, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true) + return Mul(Matrix, + MakeInterpretedVector(Input)); + +} + +// test that isUnsigned is set correctly for int8_t4_packed +export vector Test5(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 6 * 4 * 8}; + + // CHECK: %{{.+}} = call <8 x i32> @dx.op.matVecMul.v8i32.v6i32(i32 305, <6 x i32> %{{.+}}, i1 true, i32 17, %dx.types.Handle %{{.+}}, i32 0, i32 19, i32 8, i32 24, i32 0, i1 false, i32 192, i1 true) + return Mul(Matrix, + MakeInterpretedVector(Input)); + +} \ No newline at end of file diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl new file mode 100644 index 0000000000..c19e601904 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/mat-vec-muladd.hlsl @@ -0,0 +1,90 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s + +#include + +ByteAddressBuffer Buf; + +export float4 Test1(float4 input) { + using namespace dx::linalg; + + MatrixRef matrix = {Buf, + 0, 0}; + VectorRef biasVector = {Buf, 256}; + + InterpretedVector theVector = {input}; + + // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 false, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false) + return MulAdd( + matrix, theVector, + 
biasVector); +} + +export float4 Test2(float4 input) { + using namespace dx::linalg; + + MatrixRef matrix = { + Buf, 0, 0}; + VectorRef biasVector = {Buf, 256}; + + InterpretedVector theVector = {input}; + + // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false) + return MulAdd( + matrix, theVector, + biasVector); +} + +export float4 Test3(float4 input) { + using namespace dx::linalg; + + MatrixRef matrix = { + Buf, 0, 0}; + VectorRef biasVector = {Buf, 256}; + + // CHECK: %{{.+}} = call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{.+}}, i1 false, i32 8, %dx.types.Handle [[RES:%.+]], i32 0, i32 8, i32 4, i32 4, i32 2, i1 true, i32 0, %dx.types.Handle [[RES]], i32 256, i32 8, i1 false) + return MulAdd( + matrix, MakeInterpretedVector(input), + biasVector); +} + +namespace ProposalExample { + +ByteAddressBuffer model; + +vector ApplyNeuralMaterial(vector inputVector) { + using namespace dx::linalg; + + MatrixRef matrix0 = { + model, 0, 0}; + + VectorRef biasVector0 = {model, 1024}; + + MatrixRef matrix1 = + {model, 2048, 0}; + + VectorRef biasVector1 = {model, 3072}; + + MatrixRef matrix2 = { + model, 4096, 0}; + + VectorRef biasVector2 = {model, 5120}; + + vector layer0 = MulAdd( + matrix0, MakeInterpretedVector(inputVector), + biasVector0); + layer0 = max(layer0, 0); + + vector layer1 = MulAdd( + matrix1, MakeInterpretedVector(layer0), + biasVector1); + layer1 = max(layer1, 0); + + vector output = MulAdd( + matrix2, MakeInterpretedVector(layer1), + biasVector2); + output = exp(output); + + return output; +} + +} // namespace ProposalExample diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl new file mode 100644 index 0000000000..e930557cf9 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outer-product-accumulate-matrix-layout.hlsl @@ -0,0 +1,28 @@ +// RUN: %dxc -I %hlsl_headers -T cs_6_9 %s -enable-16bit-types -DML=MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL -DSTRIDE=0 2>&1 | FileCheck %s + +//Source file for the IR in \tools\clang\test\LitDXILValidation\outer-product-accumulate-matrix-layout-failing.ll +//Source file for the IR in \tools\clang\test\LitDXILValidation\outer-product-accumulate-matrix-layout-passing.ll + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer input_vector_buffer2; +RWByteAddressBuffer matrix_buffer; + +#include + +// CHECK: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 3, i32 0) +using namespace dx::linalg; + +[Numthreads(1,1,1)] +[shader("compute")] +void main() +{ + vector input_vector1 = input_vector_buffer.Load >(0); + vector input_vector2 = input_vector_buffer2.Load >(0); + + const uint matrix_interpretation = DATA_TYPE_FLOAT16; + const uint matrix_layout = ML; + const uint matrix_offset = 0; + const uint matrix_stride = STRIDE; + + __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl new file mode 100644 index 0000000000..eda15c66f6 --- /dev/null +++ 
b/tools/clang/test/CodeGenDXIL/hlsl/linalg/outerproductaccumulate.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s | FileCheck %s + +#include + +RWByteAddressBuffer RWBuf; + +export void Test4(vector Input1, vector Input2) { + using namespace dx::linalg; + + RWMatrixRef + matrix = {RWBuf, 0, 0}; + + // CHECK: call void @dx.op.outerProductAccumulate.v128f16.v64f16(i32 307, <128 x half> %{{.+}}, <64 x half> %{{.+}}, %dx.types.Handle %{{.+}}, i32 0, i32 8, i32 3, i32 0) + + OuterProductAccumulate(Input1, Input2, matrix); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl new file mode 100644 index 0000000000..9157156f10 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/vectoraccumulate.hlsl @@ -0,0 +1,14 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s + +#include + +RWByteAddressBuffer RWBuf; + +export void Test5(vector Input) { + using namespace dx::linalg; + + RWBuf.Store >(0, Input); + + // CHECK: call void @dx.op.vectorAccumulate.v128f32(i32 308, <128 x float> %{{.*}}, %dx.types.Handle %{{.*}}, i32 0) + VectorAccumulate(Input, RWBuf, 0); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl new file mode 100644 index 0000000000..daeabf9710 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl @@ -0,0 +1,113 @@ +// REQUIRES: dxil-1-9 +// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL + +// DXIL: %dx.types.HitObject = type { i8* } + +// DXIL: %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() +// DXIL: %[[HIT:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %[[NOP]], i32 1) ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex) +// DXIL-DAG: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject %[[HIT]]) ; HitObject_IsHit(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject %[[HIT]]) ; HitObject_IsMiss(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject %[[HIT]]) ; HitObject_IsNop(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject %[[HIT]]) ; HitObject_GeometryIndex(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject %[[HIT]]) ; HitObject_HitKind(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject %[[HIT]]) ; HitObject_InstanceIndex(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject %[[HIT]]) ; HitObject_InstanceID(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject %[[HIT]]) ; HitObject_PrimitiveIndex(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject %[[HIT]]) ; HitObject_ShaderTableIndex(hitObject) +// DXIL-DAG: %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %[[HIT]], i32 40) ; HitObject_LoadLocalRootTableConstant(hitObject,offset) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject 
%[[HIT]], i32 0) ; HitObject_ObjectRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 1) ; HitObject_ObjectRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %[[HIT]], i32 2) ; HitObject_ObjectRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 0) ; HitObject_ObjectRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 1) ; HitObject_ObjectRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %[[HIT]], i32 2) ; HitObject_ObjectRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 0) ; HitObject_WorldRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 1) ; HitObject_WorldRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %[[HIT]], i32 2) ; HitObject_WorldRayOrigin(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 0) ; HitObject_WorldRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 1) ; HitObject_WorldRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %[[HIT]], i32 2) ; HitObject_WorldRayDirection(hitObject,component) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 1) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 2) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 0, i32 3) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 1) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 2) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 1, i32 3) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject 
%[[HIT]], i32 2, i32 1) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 2) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[HIT]], i32 2, i32 3) ; HitObject_ObjectToWorld3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 1) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 2) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 0, i32 3) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 1) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 2) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 1, i32 3) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 1) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 2) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL-DAG: %{{[^ ]+}} = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[HIT]], i32 2, i32 3) ; HitObject_WorldToObject3x4(hitObject,row,col) +// DXIL: ret void + +RWByteAddressBuffer outbuf; + +template +float hashM(in matrix mat) { + float h = 0.f; + for (int i = 0; i < M; ++i) + for (int j = 0; j < N; ++j) + h += mat[i][j]; + return h; +} + +[shader("raygeneration")] +void main() { + dx::HitObject hit; + int isum = 0; + float fsum = 0.0f; + vector vsum = 0; + + ///// Setters + hit.SetShaderTableIndex(1); + + ///// Getters + + // i1 accessors + isum += hit.IsHit(); + isum += hit.IsMiss(); + isum += hit.IsNop(); + + // i32 accessors + isum += hit.GetGeometryIndex(); + isum += hit.GetHitKind(); + isum += hit.GetInstanceIndex(); + isum += hit.GetInstanceID(); + isum += hit.GetPrimitiveIndex(); + isum += hit.GetShaderTableIndex(); + isum += hit.LoadLocalRootTableConstant(40); + + // float3 accessors + vsum += hit.GetWorldRayOrigin(); + vsum += hit.GetWorldRayDirection(); + vsum += hit.GetObjectRayOrigin(); + vsum += hit.GetObjectRayDirection(); + fsum += vsum[0] + vsum[1] + vsum[2]; + + // matrix accessors + fsum += hashM<3, 4>(hit.GetObjectToWorld3x4()); + fsum += hashM<4, 
3>(hit.GetObjectToWorld4x3()); + fsum += hashM<3, 4>(hit.GetWorldToObject3x4()); + fsum += hashM<4, 3>(hit.GetWorldToObject4x3()); + + // f32 accessors + isum += hit.GetRayFlags(); + fsum += hit.GetRayTMin(); + fsum += hit.GetRayTCurrent(); + + outbuf.Store(0, fsum); + outbuf.Store(4, isum); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl new file mode 100644 index 0000000000..55ef023a2f --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes.hlsl @@ -0,0 +1,27 @@ +// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL + +// DXIL: %[[APTR:[^ ]+]] = alloca %struct.CustomAttrs, align 4 +// DXIL: %[[NOP:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() +// DXIL: call void @dx.op.hitObject_Attributes.struct.CustomAttrs(i32 289, %dx.types.HitObject %[[NOP]], %struct.CustomAttrs* nonnull %[[APTR]]) ; HitObject_Attributes(hitObject,attributes) +// DXIL: %[[VPTR:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[APTR]], i32 0, i32 0 +// DXIL: %{{[^ ]+}} = load <4 x float>, <4 x float>* %[[VPTR]], align 4 +// DXIL: %[[IPTR:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[APTR]], i32 0, i32 1 +// DXIL: %{{[^ ]+}} = load i32, i32* %[[IPTR]], align 4 +// DXIL: ret void + +RWByteAddressBuffer outbuf; + +struct +CustomAttrs { + float4 v; + int y; +}; + +[shader("raygeneration")] +void main() { + dx::HitObject hit; + CustomAttrs attrs; + hit.GetAttributes(attrs); + float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y; + outbuf.Store(0, sum); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl new file mode 100644 index 0000000000..59140ab37e --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_attributes_builtin.hlsl @@ -0,0 +1,43 @@ +// RUN: %dxc /Tlib_6_9 %s | FileCheck %s +// RUN: %dxc /Tlib_6_9 -fcgl %s | FileCheck %s -check-prefix=FCGL + +// Make sure that we can use the BuiltInTriangleIntersectionAttributes struct +// as a template argument to GetAttributes. + +// For -fcgl, just check the form of the HL call. 
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.BuiltInTriangleIntersectionAttributes*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}}, %struct.BuiltInTriangleIntersectionAttributes* %{{[^ ]+}}) + +// CHECK: %[[ATTR:[^ ]+]] = alloca %struct.BuiltInTriangleIntersectionAttributes +// CHECK: call void @dx.op.hitObject_Attributes.struct.BuiltInTriangleIntersectionAttributes(i32 289, %dx.types.HitObject %{{[^ ]+}}, %struct.BuiltInTriangleIntersectionAttributes* nonnull %[[ATTR]]) + +RaytracingAccelerationStructure Scene : register(t0, space0); +RWTexture2D RenderTarget : register(u0); + +struct [raypayload] RayPayload +{ + float4 color : write(caller, closesthit, miss) : read(caller); +}; + +typedef BuiltInTriangleIntersectionAttributes MyAttribs; + +[shader("raygeneration")] +void MyRaygenShader() +{ + RayDesc ray; + ray.Origin = float3(0,0,0); + ray.Direction = float3(0, 0, 1); + ray.TMin = 0.001; + ray.TMax = 10000.0; + + RayPayload payload = { float4(0, 0, 0, 0) }; + float4 color = float4(1,1,1,1); + + dx::HitObject hit = dx::HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, ray, payload); + + MyAttribs attr; + hit.GetAttributes(attr); + payload.color += float4(attr,0,1); + + // Write the raytraced color to the output texture. + RenderTarget[DispatchRaysIndex().xy] = payload.color; +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl new file mode 100644 index 0000000000..33ea2719be --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl @@ -0,0 +1,37 @@ +// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL + +// DXIL: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %[[RQ:[^ ]+]]) ; HitObject_FromRayQuery(rayQueryHandle) +// DXIL: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %[[RQ]], i32 16, %struct.CustomAttrs* nonnull %{{[^ ]+}}) ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs) + +RaytracingAccelerationStructure RTAS; +RWStructuredBuffer UAV : register(u0); + +RayDesc MakeRayDesc() { + RayDesc desc; + desc.Origin = float3(0, 0, 0); + desc.Direction = float3(1, 0, 0); + desc.TMin = 0.0f; + desc.TMax = 9999.0; + return desc; +} + +struct CustomAttrs { + float x; + float y; +}; + +void Use(in dx::HitObject hit) { + dx::MaybeReorderThread(hit); +} + +[shader("raygeneration")] +void main() { + RayQuery q; + RayDesc ray = MakeRayDesc(); + q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray); + + Use(dx::HitObject::FromRayQuery(q)); + + CustomAttrs attrs = {1.f, 2.f}; + Use(dx::HitObject::FromRayQuery(q, 16, attrs)); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl index 1e947b2296..cc9515d7c1 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_make.hlsl @@ -25,9 +25,9 @@ // AST-NEXT: | | | |-TemplateArgument type 'unsigned int' // AST-NEXT: | | | |-TemplateArgument type 'unsigned int' // AST-NEXT: | | | |-TemplateArgument type 'RayDesc' -// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> MakeMiss 'unsigned int' // AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> RayFlags 'unsigned int' -// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> MissShaderIndex 'RayDesc' +// AST-NEXT: 
| | | |-ParmVarDecl {{[^ ]+}} <> MissShaderIndex 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> Ray 'RayDesc' // AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 387 // AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl new file mode 100644 index 0000000000..4ea00475f1 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl @@ -0,0 +1,102 @@ +// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST +// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL +// RUN: %dxc -T lib_6_9 -E main %s | FileCheck %s --check-prefix DXIL + +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> Invoke +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class Tho +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TPayload +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit Invoke 'TResult (Tho, TPayload &) const' static +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> ho 'Tho' +// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> Payload 'TPayload &' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used Invoke 'void (dx::HitObject, Payload &)' static +// AST-NEXT: | | | |-TemplateArgument type 'void' +// AST-NEXT: | | | |-TemplateArgument type 'dx::HitObject':'dx::HitObject' +// AST-NEXT: | | | |-TemplateArgument type 'Payload' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> ho 'dx::HitObject':'dx::HitObject' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> Payload 'Payload &&__restrict' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 382 +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" + +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> TraceRay +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TAccelerationStructure +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TRayFlags +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TInstanceInclusionMask +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TRayContributionToHitGroupIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TMultiplierForGeometryContributionToHitGroupIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TMissShaderIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TRay +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TPayload +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit TraceRay 'TResult (TAccelerationStructure, TRayFlags, TInstanceInclusionMask, TRayContributionToHitGroupIndex, TMultiplierForGeometryContributionToHitGroupIndex, TMissShaderIndex, TRay, TPayload &) const' static +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> AccelerationStructure 'TAccelerationStructure' +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> RayFlags 'TRayFlags' +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> InstanceInclusionMask 'TInstanceInclusionMask' +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> RayContributionToHitGroupIndex 'TRayContributionToHitGroupIndex' +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> MultiplierForGeometryContributionToHitGroupIndex 'TMultiplierForGeometryContributionToHitGroupIndex' +// AST-NEXT: | | | | 
|-ParmVarDecl {{[^ ]+}} <> MissShaderIndex 'TMissShaderIndex' +// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> Ray 'TRay' +// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> Payload 'TPayload &' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used TraceRay 'dx::HitObject (RaytracingAccelerationStructure, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, RayDesc, Payload &)' static +// AST-NEXT: | | | |-TemplateArgument type 'dx::HitObject' +// AST-NEXT: | | | |-TemplateArgument type 'RaytracingAccelerationStructure' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'RayDesc' +// AST-NEXT: | | | |-TemplateArgument type 'Payload' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> AccelerationStructure 'RaytracingAccelerationStructure' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> RayFlags 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> InstanceInclusionMask 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> RayContributionToHitGroupIndex 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> MultiplierForGeometryContributionToHitGroupIndex 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> MissShaderIndex 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> Ray 'RayDesc' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> Payload 'Payload &&__restrict' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 389 +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" + +// FCGL: %[[HANDLE:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef) +// FCGL-NEXT: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %{{[^ ]+}}, %dx.types.Handle %[[HANDLE]], i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %{{[^ ]+}}, %struct.Payload* %{{[^ ]+}}) +// FCGL-NEXT: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %{{[^ ]+}}, %struct.Payload* %{{[^ ]+}}) + +// DXIL: %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure +// DXIL: %[[HIT:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %{{[^ ]+}}) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) +// DXIL: call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[HIT]], %struct.Payload* nonnull %{{[^ ]+}}) ; 
HitObject_Invoke(hitObject,payload) + +// DXIL: !dx.dxrPayloadAnnotations = !{![[MDPLD:[^ ]+]]} +// DXIL: ![[MDPLD]] = !{i32 0, %struct.Payload undef, !{{[^ ]+}}} + +RaytracingAccelerationStructure RTAS; +RWStructuredBuffer UAV : register(u0); + +struct [raypayload] +Payload { + float3 dummy : read(closesthit) : write(caller, anyhit); +}; + +[shader("raygeneration")] +void main() { + RayDesc rayDesc; + rayDesc.Origin = float3(0.0, 1.0, 2.0); + rayDesc.TMin = 3.0f; + rayDesc.Direction = float3(4.0, 5.0, 6.0); + rayDesc.TMax = 7.0f; + + Payload pld; + pld.dummy = float3(7.0, 8.0, 9.0); + + dx::HitObject hit = dx::HitObject::TraceRay( + RTAS, + RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES, + 1, + 2, + 4, + 0, + rayDesc, + pld); + + dx::HitObject::Invoke(hit, pld); +} diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/RayQuery/tracerayinline.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl similarity index 100% rename from tools/clang/test/HLSLFileCheck/hlsl/objects/RayQuery/tracerayinline.hlsl rename to tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl diff --git a/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl new file mode 100644 index 0000000000..256b6a04e8 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl @@ -0,0 +1,14 @@ +// RUN: %dxc -T vs_6_5 -E main %s | FileCheck %s + +// CHECK-DAG: %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false) +// CHECK-DAG: %[[RQ:[^ ]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513) +// CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, + +RaytracingAccelerationStructure RTAS; + +RayDesc rayDesc; + +void main() { + RayQuery rayQuery; + rayQuery.TraceRayInline(RTAS, 1, 2, rayDesc); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl index 0b7f0d6b2f..f13772970b 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl @@ -2,7 +2,6 @@ // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=7 %s | FileCheck %s // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=125 %s | FileCheck %s // RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=256 %s | FileCheck %s -// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=1024 %s | FileCheck %s // Test vector-enabled non-trivial intrinsics that take parameters of various types. 
@@ -203,6 +202,36 @@ void main() { // CHECK: fmul fast <[[NUM]] x float> [[tmp]], @dx.op.unary.[[HTY]](i32 23, <[[NUM]] x half> [[hvec2]]) ; Log(value) + // CHECK: [[tmp2:%.*]] = fmul fast <[[NUM]] x half> [[tmp]], [[hvec1]] + // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 21, <[[NUM]] x half> [[tmp2]]) ; Exp(value) + hRes += pow(hVec2, hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 23, <[[NUM]] x float> [[fvec2]]) ; Log(value) + // CHECK: [[tmp2:%.*]] = fmul fast <[[NUM]] x float> [[tmp]], [[fvec1]] + // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 21, <[[NUM]] x float> [[tmp2]]) ; Exp(value) + fRes += pow(fVec2, fVec1); + + vector hVal; + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 29, <[[NUM]] x half> [[hvec1]]) ; Round_z(value) + // CHECK: fsub fast <[[NUM]] x half> [[hvec1]], [[tmp]] + hRes *= modf(hVec1, hVal); + hRes += hVal; + + vector fVal; + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 29, <[[NUM]] x float> [[fvec1]]) ; Round_z(value) + // CHECK: fsub fast <[[NUM]] x float> [[fvec1]], [[tmp]] + fRes *= modf(fVec1, fVal); + fRes += fVal; + // CHECK-NOT: extractelement // CHECK-NOT: insertelement // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x half> [[hvec2]], [[hvec1]] @@ -227,6 +256,25 @@ void main() { // CHECK: fmul fast <[[NUM]] x float> [[mul]], [[sub]] fRes += smoothstep(fVec1, fVec2, fVec3); + // Note that Fabs is tested in longvec-trivial-unary-float-intrinsics. + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i16> zeroinitializer, [[svec1]] + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 37, <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[tmp]]) ; IMax(a,b) + sRes += abs(sVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i32> zeroinitializer, [[ivec1]] + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 37, <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[tmp]]) ; IMax(a,b) + iRes += abs(iVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = sub <[[NUM]] x i64> zeroinitializer, [[lvec1]] + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 37, <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[tmp]]) ; IMax(a,b) + lRes += abs(lVec1); + // Intrinsics that expand into llvm ops. 
// CHECK-NOT: extractelement diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl index 6ebb511b00..37fb1d2e15 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl @@ -9,6 +9,13 @@ // RUN: %dxc -DFUNC=countbits -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY // RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY // RUN: %dxc -DFUNC=firstbitlow -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_fine -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_fine -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=fwidth -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY // RUN: %dxc -DFUNC=QuadReadLaneAt -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD // RUN: %dxc -DFUNC=QuadReadAcrossX -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD // RUN: %dxc -DFUNC=QuadReadAcrossY -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl index 91ab631a7e..9cc3d23b66 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl @@ -1,3 +1,5 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=abs -DOP=6 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=abs -DOP=6 -DNUM=1022 %s | FileCheck %s // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=7 %s | FileCheck %s // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=1022 %s | FileCheck %s // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos -DOP=12 -DNUM=7 %s | FileCheck %s diff --git a/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl b/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl new file mode 100644 index 0000000000..9982cf1cda --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/amplification_shader_derivative.hlsl @@ -0,0 +1,28 @@ +// RUN: %dxc -T as_6_5 -E main -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s --check-prefix=VK13 +// RUN: %dxc -T as_6_5 -E main -fspv-target-env=vulkan1.1 -Vd %s -spirv | FileCheck %s --check-prefix=VK11 + +// VK13-DAG: OpCapability ComputeDerivativeGroupLinearKHR +// VK13-DAG: OpCapability DerivativeControl +// VK13-DAG: OpCapability MeshShadingEXT +// VK13-DAG: OpExtension "SPV_EXT_mesh_shader" +// VK13-DAG: OpExtension "SPV_KHR_compute_shader_derivatives" +// VK13: OpEntryPoint TaskEXT %main "main" +// VK13: OpExecutionMode %main DerivativeGroupLinearKHR + +// VK11-DAG: OpExtension "SPV_NV_mesh_shader" +// VK11: 
OpEntryPoint TaskNV %main "main" +// VK11-NOT: OpExecutionMode %main DerivativeGroup + +struct AmplificationPayload +{ + float4 value; +}; + +groupshared AmplificationPayload payload; + +[numthreads(4, 1, 1)] +void main(in uint tid : SV_GroupThreadID, in uint gtid : SV_GroupID) +{ + payload.value = ddx_coarse(float4(tid, 0, 0, 0)); + DispatchMesh(1,1,1, payload); +} diff --git a/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv b/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv index 30565394b4..1425137c68 100644 --- a/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv +++ b/tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv @@ -161,7 +161,7 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch&1 | FileCheck %s + +// CHECK: error: field "gFoo" at register(c5) overlaps with previous members + +uniform float4x4 gMVP : register(c0); +uniform float4 gFoo : register(c5); +uniform float4 gBar : register(c5); + +float4 main(float4 pos : POSITION) : SV_Position { + return mul(gMVP, pos * gFoo + gBar); +} diff --git a/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl b/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl new file mode 100644 index 0000000000..f596a2db50 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/enum_sizeof.hlsl @@ -0,0 +1,31 @@ +// RUN: %dxc -T cs_6_0 -E main -fcgl %s -spirv | FileCheck %s + +enum E1 : uint64_t +{ + v1 = 0, +}; + +enum E2 : uint32_t +{ + v2 = 0, +}; + +struct S { + E1 e1; + E2 e2; +}; + +RWBuffer b; + +[numthreads(128, 1, 1)] +void main() +{ +// CHECK: OpImageWrite {{%.*}} %uint_0 %int_8 None + b[0] = sizeof(E1); + +// CHECK: OpImageWrite {{%.*}} %uint_1 %int_4 None + b[1] = sizeof(E2); + +// CHECK: OpImageWrite {{%.*}} %uint_2 %int_16 None + b[2] = sizeof(S); +} diff --git a/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl b/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl index da25ead9c1..312476b260 100644 --- a/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl +++ b/tools/clang/test/CodeGenSPIRV/fn.export.with.entrypoint.hlsl @@ -1,4 +1,4 @@ -// RUN: %dxc -T as_6_6 -E main -fspv-target-env=vulkan1.3 -fcgl %s -spirv | FileCheck %s +// RUN: %dxc -T as_6_6 -E main -fspv-target-env=universal1.5 -fcgl %s -spirv | FileCheck %s // CHECK: OpCapability Linkage // CHECK: OpDecorate %external_function LinkageAttributes "external_function" Export @@ -10,4 +10,4 @@ export int external_function() { void main() { external_function(); return; -} \ No newline at end of file +} diff --git a/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl b/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl new file mode 100644 index 0000000000..c49534948b --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/groupshared.init.warning.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -T cs_6_0 -E main -spirv %s 2>&1 | FileCheck %s + +groupshared uint testing = 0; + +[numthreads(64, 1, 1)] +void main(uint local_thread_id_flat : SV_GroupIndex) { + + InterlockedAdd(testing, 1); + GroupMemoryBarrierWithGroupSync(); + + if (local_thread_id_flat == 0) { + if (testing > 64) { + printf("testing is %u wtf", testing); + } + } +} + +// CHECK: warning: Initializer of external global will be ignored +// CHECK-NEXT: groupshared uint testing = 0; \ No newline at end of file diff --git a/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl b/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl index 6bbcdd3764..08669c3de0 100644 --- a/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl +++ b/tools/clang/test/CodeGenSPIRV/hs.const.output-patch.out.hlsl 
@@ -8,13 +8,13 @@ struct ControlPoint { float4 position : POSITION; }; // CHECK: OpFunctionCall %void %HullConst %param_var_edge %param_var_inside %param_var_myFloat // CHECK: [[edges:%[0-9]+]] = OpLoad %_arr_float_uint_3 %param_var_edge // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_0 -// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 0 +// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr:%[0-9]+]] 0 // CHECK: OpStore [[addr]] [[val]] // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_1 -// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 1 +// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr]] 1 // CHECK: OpStore [[addr]] [[val]] // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %uint_2 -// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float %66 2 +// CHECK: [[val:%[0-9]+]] = OpCompositeExtract %float [[arr]] 2 // CHECK: OpStore [[addr]] [[val]] // CHECK: [[val:%[0-9]+]] = OpLoad %float %param_var_inside // CHECK: [[addr:%[0-9]+]] = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %uint_0 diff --git a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl index bb4c2efde1..88a902d326 100644 --- a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl +++ b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.decorate.member.hlsl @@ -4,9 +4,9 @@ template [[vk::ext_instruction(/*spv::OpBitcast*/124)]] T Bitcast(U); -// CHECK: OpMemberDecorate %S 0 Offset 0 -// CHECK: OpMemberDecorate %S 1 Offset 16 -// CHECK: %S = OpTypeStruct %v4float %v4float +// CHECK-DAG: OpMemberDecorate %S 0 Offset 0 +// CHECK-DAG: OpMemberDecorate %S 1 Offset 16 +// CHECK-DAG: %S = OpTypeStruct %v4float %v4float struct S { @@ -14,6 +14,12 @@ struct S [[vk::ext_decorate(/*offset*/ 35, 16)]] float4 f2; }; +// CHECK-DAG: OpDecorateString %out_var_SV_TARGET UserSemantic "raster_order_group_0" +struct PixelOutput +{ + [[vk::location(0), vk::ext_decorate_string(5635, "raster_order_group_0")]] float4 rt0 : SV_TARGET; +}; + using PointerType = vk::SpirvOpaqueType< /* OpTypePointer */ 32, /* PhysicalStorageBuffer */ vk::Literal >, @@ -27,14 +33,16 @@ S Load(PointerType pointer, uint64_t address; -float4 main() : SV_TARGET +PixelOutput main() { // CHECK: [[BC:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_S {{%[0-9]+}} PointerType ptr = Bitcast(address); +PixelOutput output; // CHECK: [[LD:%[0-9]+]] = OpLoad %S [[BC]] Aligned 32 // CHECK: [[RET:%[0-9]+]] = OpCompositeExtract %v4float [[LD]] 0 // CHECK: OpStore %out_var_SV_TARGET [[RET]] - return Load(ptr).f1; +output.rt0 = Load(ptr).f1; + return output; } diff --git a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl index 0d63662ef8..beb0e23a95 100644 --- a/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl +++ b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl @@ -2,11 +2,11 @@ // CHECK: OpCapability ShaderClockKHR // CHECK: OpExtension "SPV_KHR_shader_clock" -// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeId %uint_8 %uint_8 %uint_8 -// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeHintId %uint_4 %uint_4 %uint_4 +// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} LocalSizeId %uint_8 %uint_6 %uint_8 +// CHECK: OpExecutionModeId {{%[a-zA-Z0-9_]+}} 
LocalSizeHintId %int_4 %int_4 %int_4 int main() : SV_Target0 { - vk::ext_execution_mode_id(/*LocalSizeId*/38, 8, 8, 8); + vk::ext_execution_mode_id(/*LocalSizeId*/38, 8u, 6u, 8u); [[vk::ext_capability(5055)]] [[vk::ext_extension("SPV_KHR_shader_clock")]] diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl new file mode 100644 index 0000000000..a306463466 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.double.hlsl @@ -0,0 +1,21 @@ +// RUN: %dxc -T ps_6_2 -E main -fcgl %s -spirv 2>&1 | FileCheck %s + +// CHECK: :14:22: warning: conversion from larger type 'double' to smaller type 'float', possible loss of data [-Wconversion] +// CHECK: :20:22: warning: conversion from larger type 'double2' to smaller type 'vector', possible loss of data [-Wconversion] + +void main() { + double a; + double2 b; + +// CHECK: [[a:%[0-9]+]] = OpLoad %double %a +// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %float [[a]] +// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %float [[c]] +// CHECK-NEXT: OpFConvert %double [[r]] + double da = ddx(a); + +// CHECK: [[b:%[0-9]+]] = OpLoad %v2double %b +// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %v2float [[b]] +// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %v2float [[c]] +// CHECK-NEXT: OpFConvert %v2double [[r]] + double2 db = ddx(b); +} \ No newline at end of file diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl new file mode 100644 index 0000000000..11b63151ee --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/intrinsics.ddx.half.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -T ps_6_2 -E main -enable-16bit-types -fcgl %s -spirv | FileCheck %s + +void main() { + + half a; + half2 b; + +// CHECK: [[a:%[0-9]+]] = OpLoad %half %a +// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %float [[a]] +// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %float [[c]] +// CHECK-NEXT: OpFConvert %half [[r]] + half da = ddx(a); + +// CHECK: [[b:%[0-9]+]] = OpLoad %v2half %b +// CHECK-NEXT: [[c:%[0-9]+]] = OpFConvert %v2float [[b]] +// CHECK-NEXT: [[r:%[0-9]+]] = OpDPdx %v2float [[c]] +// CHECK-NEXT: OpFConvert %v2half [[r]] + half2 db = ddx(b); +} diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl index 4d04896781..629e7527c3 100644 --- a/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl +++ b/tools/clang/test/CodeGenSPIRV/intrinsics.mul.hlsl @@ -1,5 +1,8 @@ // RUN: %dxc -T ps_6_0 -E main -fcgl %s -spirv | FileCheck %s +StructuredBuffer buffer_vec; +StructuredBuffer buffer_mat; + /* According to HLSL reference, mul() has the following versions: @@ -448,6 +451,7 @@ void main() { // mul( Mat(Mx1) * Mat(1xN) ) --> Mat(MxN) matrix float1x3 mat1x3; float3x2 mat3x2; + float3x3 mat3x3; float3x1 mat3x1; float1x4 mat1x4; @@ -474,4 +478,25 @@ void main() { // CHECK-NEXT: [[result3:%[0-9]+]] = OpCompositeConstruct %mat3v4float [[row0]] [[row1]] [[row2]] // CHECK-NEXT: OpStore %result3 [[result3]] float3x4 result3 = mul( mat3x1, mat1x4 ); // result is float3x4 matrix + + float3 v3; + +// CHECK: [[matp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_mat3v3float %buffer_mat %int_0 %int_0 +// CHECK: [[mat:%[0-9]+]] = OpLoad %mat3v3float [[matp]] +// CHECK: [[vec:%[0-9]+]] = OpLoad %v3float %v3 +// CHECK: {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]] + float3 result4 = mul(buffer_mat.Load(0), v3); + +// CHECK: [[mat:%[0-9]+]] = OpLoad %mat3v3float %mat3x3 +// CHECK: [[vecp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v3float 
%buffer_vec %int_0 %int_1 +// CHECK: [[vec:%[0-9]+]] = OpLoad %v3float [[vecp]] +// CHECK: {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]] + float3 result5 = mul(mat3x3, buffer_vec.Load(1)); + +// CHECK: [[matp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_mat3v3float %buffer_mat %int_0 %int_2 +// CHECK: [[mat:%[0-9]+]] = OpLoad %mat3v3float [[matp]] +// CHECK: [[vecp:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v3float %buffer_vec %int_0 %int_2 +// CHECK: [[vec:%[0-9]+]] = OpLoad %v3float [[vecp]] +// CHECK: {{.*}} = OpVectorTimesMatrix %v3float [[vec]] [[mat]] + float3 result6 = mul(buffer_mat.Load(2), buffer_vec.Load(2)); } diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl index 7be0713e48..c2892cfc29 100644 --- a/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl +++ b/tools/clang/test/CodeGenSPIRV/intrinsics.vkrawbufferload.hlsl @@ -12,7 +12,16 @@ struct BufferData { float3 v; }; +using MyInt = vk::SpirvType< + /*spv::OpTypeInt*/21, + 1,1, // size and alignment + vk::Literal >, // bits + vk::Literal > // signed +>; + uint64_t Address; + +[[vk::ext_capability(/* Int16 */ 22)]] float4 main() : SV_Target0 { // CHECK: [[addr:%[0-9]+]] = OpLoad %ulong // CHECK-NEXT: [[buf:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_float [[addr]] @@ -50,5 +59,10 @@ float4 main() : SV_Target0 { // CHECK-NEXT: [[load:%[0-9]+]] = OpLoad %BufferData_0 [[buf]] Aligned 4 d = vk::RawBufferLoad<BufferData>(0); + // CHECK: [[buf:%[0-9]+]] = OpBitcast %_ptr_PhysicalStorageBuffer_spirvIntrinsicType %ulong_0 + // CHECK-NEXT: [[load:%[0-9]+]] = OpLoad %spirvIntrinsicType [[buf]] Aligned 4 + // CHECK-NEXT: OpStore %mi [[load]] + MyInt mi = vk::RawBufferLoad<MyInt>(0); + return float4(w.x, x, y, z); } diff --git a/tools/clang/test/CodeGenSPIRV/linalg/outerproductaccumulate-spirv-errors.hlsl b/tools/clang/test/CodeGenSPIRV/linalg/outerproductaccumulate-spirv-errors.hlsl new file mode 100644 index 0000000000..0213103926 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/linalg/outerproductaccumulate-spirv-errors.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types -spirv %s -verify + +// Tests that the header file cannot be included for spirv compilations +// This is a copy of \tools\clang\test\CodeGenDXIL\hlsl\linalg\outerproductaccumulate.hlsl +// except that spirv is targeted + +// expected-error@dx/linalg.h:4{{Cooperative vectors not (yet) supported for SPIRV}} +#include <dx/linalg.h> + +RWByteAddressBuffer RWBuf; + +export void Test4(vector Input1, vector Input2) { + using namespace dx::linalg; + + RWMatrixRef + matrix = {RWBuf, 0, 0}; + + OuterProductAccumulate(Input1, Input2, matrix); +} diff --git a/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl b/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl new file mode 100644 index 0000000000..eb4a803548 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/logical_copy.hlsl @@ -0,0 +1,67 @@ +// RUN: %dxc %s -fcgl -spirv -T ps_6_8 -fspv-target-env=vulkan1.1spirv1.4 | FileCheck %s + + + +struct WithBool { + bool b; +}; + +struct StructWithBool { + WithBool wb; +}; + +struct StructWithoutBool { + int a; +}; + +struct OuterStruct { + StructWithBool a[2]; + WithBool b; + StructWithoutBool c; + StructWithoutBool d[2]; +} S; + + +// CHECK: %GetStruct = OpFunction %OuterStruct_0 None %34 +// CHECK: %bb_entry_0 = OpLabel +// CHECK: [[ld:%[0-9]+]] = OpLoad %OuterStruct %39 + +// The array `a` must be split up because it contains a bool that needs a +// conversion from int to bool.
+// CHECK: [[arr_with_bool:%[0-9]+]] = OpCompositeExtract %_arr_StructWithBool_uint_2 [[ld]] 0 +// CHECK: [[struct_with_bool:%[0-9]+]] = OpCompositeExtract %StructWithBool [[arr_with_bool]] 0 +// CHECK: [[with_bool:%[0-9]+]] = OpCompositeExtract %WithBool [[struct_with_bool]] 0 +// CHECK: [[int:%[0-9]+]] = OpCompositeExtract %uint [[with_bool]] 0 +// CHECK: [[bool:%[0-9]+]] = OpINotEqual %bool [[int]] %uint_0 +// CHECK: [[with_bool:%[0-9]+]] = OpCompositeConstruct %WithBool_0 [[bool]] +// CHECK: [[struct_with_bool:%[0-9]+]] = OpCompositeConstruct %StructWithBool_0 [[with_bool]] + +// Skip second element of the array. It is more of the same. +// CHECK: [[a:%[0-9]+]] = OpCompositeConstruct %_arr_StructWithBool_0_uint_2 [[struct_with_bool]] {{%.*}} + +// The struct `b` must be split up for the same reason. +// CHECK: [[with_bool:%[0-9]+]] = OpCompositeExtract %WithBool [[ld]] 1 +// CHECK: [[int:%[0-9]+]] = OpCompositeExtract %uint [[with_bool]] 0 +// CHECK: [[bool:%[0-9]+]] = OpINotEqual %bool [[int]] %uint_0 +// CHECK: [[b:%[0-9]+]] = OpCompositeConstruct %WithBool_0 [[bool]] + +// The struct `c` can use OpCopyLogical. +// CHECK: %59 = OpCompositeExtract %StructWithoutBool [[ld]] 2 +// CHECK: [[c:%[0-9]+]] = OpCopyLogical %StructWithoutBool_0 %59 + +// The array `d` can use OpCopyLogical. +// CHECK: %61 = OpCompositeExtract %_arr_StructWithoutBool_uint_2 [[ld]] 3 +// CHECK: [[d:%[0-9]+]] = OpCopyLogical %_arr_StructWithoutBool_0_uint_2 %61 + +// CHECK: [[r:%[0-9]+]] = OpCompositeConstruct %OuterStruct_0 [[a]] [[b]] [[c]] [[d]] +// CHECK: OpStore {{%.*}} [[r]] +// CHECK: OpFunctionEnd + +OuterStruct GetStruct() { return S; } + +uint main() : SV_TARGET +{ + GetStruct(); + return 0; +} + diff --git a/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl b/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl new file mode 100644 index 0000000000..3f26921e28 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/mesh_shader_derivative.hlsl @@ -0,0 +1,34 @@ +// RUN: %dxc -T ms_6_5 -E main -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s --check-prefix=VK13 +// RUN: %dxc -T ms_6_5 -E main -fspv-target-env=vulkan1.1 -Vd %s -spirv | FileCheck %s --check-prefix=VK11 + +// VK13-DAG: OpCapability ComputeDerivativeGroupLinearKHR +// VK13-DAG: OpCapability DerivativeControl +// VK13-DAG: OpCapability MeshShadingEXT +// VK13-DAG: OpExtension "SPV_EXT_mesh_shader" +// VK13-DAG: OpExtension "SPV_KHR_compute_shader_derivatives" +// VK13: OpEntryPoint MeshEXT %main "main" +// VK13: OpExecutionMode %main DerivativeGroupLinearKHR + +// VK11-DAG: OpExtension "SPV_NV_mesh_shader" +// VK11: OpEntryPoint MeshNV %main "main" +// VK11-NOT: OpExecutionMode %main DerivativeGroup + +struct VSOut +{ + float4 pos : SV_Position; +}; + +[numthreads(4, 1, 1)] +[outputtopology("triangle")] +void main(in uint tid : SV_GroupThreadID, out vertices VSOut verts[3], out indices uint3 tris[1]) +{ + SetMeshOutputCounts(3, 1); + + float4 val = ddx_coarse(float4(tid, 0, 0, 0)); + + verts[0].pos = val; + verts[1].pos = val + float4(0,1,0,0); + verts[2].pos = val + float4(1,0,0,0); + + tris[0] = uint3(0,1,2); +} diff --git a/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl b/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl new file mode 100644 index 0000000000..c50ef252e9 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/meshshading.ext.amplification.payload.hlsl @@ -0,0 +1,15 @@ +// RUN: %dxc -E main -T as_6_8 -spirv %s -E main -fspv-target-env=vulkan1.1spirv1.4 | FileCheck %s + 
+struct S { + uint a; +}; + +groupshared S s; +// CHECK: %s = OpVariable {{.*}} TaskPayloadWorkgroupEXT + +[numthreads(1, 1, 1)] +void main() +{ +// CHECK: OpEmitMeshTasksEXT %uint_1 %uint_1 %uint_1 %s + DispatchMesh(1, 1, 1, s); +} diff --git a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl index cb5d7f771f..2a143afab2 100644 --- a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl +++ b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl @@ -1,6 +1,4 @@ // RUN: %dxc -T ms_6_6 -fspv-target-env=vulkan1.1spirv1.4 -E main %s -spirv | FileCheck %s -// XFAIL: * -// FIXME(7160): test disabled until the spirv-val fix is merged. struct MeshletPrimitiveOut { diff --git a/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl new file mode 100644 index 0000000000..42b18d35a0 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.barrier.compute.hlsl @@ -0,0 +1,15 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s + +// Barrier is called from a compute shader + +[Shader("compute")] +[NumThreads(5,1,1)] +void node116_barrier_compute() +{ + Barrier(1, 3); +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U72:%[^ ]*]] = OpConstant [[UINT]] 72 +// CHECK: OpControlBarrier [[U2]] [[U2]] [[U72]] diff --git a/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl new file mode 100644 index 0000000000..9b2dc23eea --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.barrier.memory-arg.hlsl @@ -0,0 +1,60 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s + +// Barrier is called using a memory type argument + +static const int a = 7; +static const int16_t b = 2; + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(16, 1, 1)] +void node117_barrier_memoryarg() +{ + // literal integer flag values + Barrier(1, 3); + + // static const integer flag values + Barrier(a, b); + + // AllMemoryBarrier() -> + Barrier(UAV_MEMORY|GROUP_SHARED_MEMORY|NODE_INPUT_MEMORY|NODE_OUTPUT_MEMORY, + DEVICE_SCOPE); + + // AllMemoryBarrierWithGroupSync() -> + Barrier(UAV_MEMORY|GROUP_SHARED_MEMORY|NODE_INPUT_MEMORY|NODE_OUTPUT_MEMORY, + GROUP_SYNC|DEVICE_SCOPE); + + // DeviceMemoryBarrier() -> + Barrier(UAV_MEMORY, + DEVICE_SCOPE); + + // DeviceMemoryBarrierWithGroupSync() -> + Barrier(UAV_MEMORY, + GROUP_SYNC|DEVICE_SCOPE); + + // GroupMemoryBarrier() -> + Barrier(GROUP_SHARED_MEMORY, + GROUP_SCOPE); + + // GroupMemoryBarrierWithGroupSync() -> + Barrier(GROUP_SHARED_MEMORY, + GROUP_SYNC|GROUP_SCOPE); +} + + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant %uint 2 +// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant %uint 5 +// CHECK-DAG: [[U72:%[^ ]*]] = OpConstant %uint 72 +// CHECK-DAG: [[U264:%[^ ]*]] = OpConstant %uint 264 +// CHECK-DAG: [[U328:%[^ ]*]] = OpConstant %uint 328 +// CHECK-DAG: [[U4424:%[^ ]*]] = OpConstant %uint 4424 + +// CHECK: OpControlBarrier [[U2]] [[U2]] [[U72]] +// CHECK: OpMemoryBarrier [[U2]] [[U328]] +// CHECK: OpMemoryBarrier [[U5]] [[U4424]] +// CHECK: OpControlBarrier [[U2]] [[U5]] [[U4424]] +// CHECK: OpMemoryBarrier [[U5]] [[U72]] +// CHECK: OpControlBarrier [[U2]] [[U5]] [[U72]] +// CHECK: OpMemoryBarrier [[U2]] [[U264]] +// CHECK: OpControlBarrier [[U2]] [[U2]] [[U264]] diff 
--git a/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl b/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl new file mode 100644 index 0000000000..215acf7bfd --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.barrier.object-arg.hlsl @@ -0,0 +1,213 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Barrier is called with each node record and UAV type + +struct RECORD +{ + uint value; +}; + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U256:%[^ ]*]] = OpConstant [[UINT]] 256 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U3:%[^ ]*]] = OpConstant [[UINT]] 3 +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U4424:%[^ ]*]] = OpConstant [[UINT]] 4424 +// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5 + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,1)] +[NodeDispatchGrid(256,1,1)] +void node01(DispatchNodeInputRecord input) +{ + Barrier(input, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(256,1,1)] +void node02([MaxRecords(8)] GroupNodeInputRecords input) +{ + Barrier(input, 3); +} + +// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424 + +[Shader("node")] +[NodeLaunch("thread")] +void node03(RWThreadNodeInputRecord input) +{ + Barrier(input, 0); +} + +// CHECK: OpMemoryBarrier %uint_4 %uint_4424 + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(256,1,1)] +void node04([MaxRecords(6)] RWGroupNodeInputRecords input) +{ + Barrier(input, 0); +} + +// CHECK: OpMemoryBarrier %uint_4 %uint_4424 + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,1)] +[NodeDispatchGrid(256,1,1)] +void node05([MaxRecords(5)] NodeOutput outputs) +{ + ThreadNodeOutputRecords outrec = outputs.GetThreadNodeOutputRecords(1); + Barrier(outrec, 0); +} + +// CHECK: OpMemoryBarrier %uint_4 %uint_4424 + +[Shader("node")] +[NodeLaunch("thread")] +void node06([MaxRecords(5)] NodeOutput outputs) +{ + ThreadNodeOutputRecords outrec = outputs.GetThreadNodeOutputRecords(3); + Barrier(outrec, 0); +} + +// CHECK: OpMemoryBarrier %uint_4 %uint_4424 + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(256,1,3)] +void node07([MaxRecords(5)] NodeOutput outputs) +{ + GroupNodeOutputRecords outrec = outputs.GetGroupNodeOutputRecords(1); + Barrier(outrec, 3); +} + +// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424 + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node08([MaxRecords(5)] NodeOutput outputs) +{ + GroupNodeOutputRecords outrec = outputs.GetGroupNodeOutputRecords(4); + Barrier(outrec, 3); +} + +// CHECK: OpControlBarrier %uint_2 %uint_2 %uint_4424 + +RWBuffer obj09; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node09() +{ + Barrier(obj09, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWTexture1D obj10; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node10() +{ + Barrier(obj10, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWTexture1DArray obj11; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void 
node11() +{ + Barrier(obj11, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWTexture2D obj12; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node12() +{ + Barrier(obj12, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWTexture2DArray obj13; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node13() +{ + Barrier(obj13, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWTexture3D obj14; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node14() +{ + Barrier(obj14, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWStructuredBuffer obj15; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node15() +{ + Barrier(obj15, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +RWByteAddressBuffer obj16; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node16() +{ + Barrier(obj16, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 + +AppendStructuredBuffer obj17; +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(256,1,4)] +[NodeDispatchGrid(256,1,1)] +void node17() +{ + Barrier(obj17, 5); +} + +// CHECK: OpControlBarrier %uint_2 %uint_5 %uint_4424 diff --git a/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl new file mode 100644 index 0000000000..a3c369b252 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.broadcasting.no-input.hlsl @@ -0,0 +1,15 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Broadcasting launch node with no input + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(3,4,5)] +[NumThreads(6,7,1)] +[NodeIsProgramEntry] +void node070_broadcasting_noinput() +{ +} + +// CHECK: OpReturn + diff --git a/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl b/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl new file mode 100644 index 0000000000..14e899da02 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.coalescing.num-threads.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s + +// Coalescing launch node with thread group defined in the shader + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(1024,1,1)] +[NodeIsProgramEntry] +void node008_coalescing_numthreads_shader() +{ +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]] +// CHECK-DAG: OpExecutionMode [[SHADER]] CoalescingAMDX +// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1 +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl b/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl new file mode 100644 index 0000000000..302c8ea698 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.dispatch-grid.hlsl @@ -0,0 +1,28 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Broadcasting launch node with dispatch grid defined in shader + +struct INPUT_NOGRID +{ + uint textureIndex; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(2,3,2)] +[NumThreads(1024,1,1)] +[NodeIsProgramEntry] +void 
node001_dispatchgrid_shader(DispatchNodeInputRecord input) +{ +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]] +// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1 +// CHECK-DAG: OpExecutionModeId [[SHADER]] StaticNumWorkgroupsAMDX [[U2:%[0-9A-Za-z_]*]] +// CHECK-SAME: [[U3:%[^ ]*]] [[U2]] +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U3]] = OpConstant [[UINT]] 3 +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl new file mode 100644 index 0000000000..fa16429a1b --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.empty-node-input.hlsl @@ -0,0 +1,28 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Coalescing launch node declares EmptyNodeInput + +RWBuffer buf0; + +[Shader("node")] +[NodeLaunch("coalescing")] +[NodeIsProgramEntry] +[NumThreads(2,1,1)] +void emptynodeinput(EmptyNodeInput input) +{ + // input.Count should always return 1 here, so there is + // an opportunity for an optimization. + buf0[0] = input.Count(); +} + +// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[IMG:%[^ ]*]] = OpTypeImage [[UINT]] Buffer 2 0 0 2 R32ui +// CHECK-DAG: [[IMGPTR:%[^ ]*]] = OpTypePointer UniformConstant [[IMG]] +// CHECK-DAG: [[BUF:%[^ ]*]] = OpVariable [[IMGPTR]] UniformConstant + +// CHECK: [[COUNT:%[^ ]*]] = OpNodePayloadArrayLengthAMDX [[UINT]] +// CHECK: [[IMAGE:%[^ ]*]] = OpLoad [[IMG]] [[BUF]] +// CHECK: OpImageWrite [[IMAGE]] [[U0]] [[COUNT]] None diff --git a/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl b/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl new file mode 100644 index 0000000000..8e1ce56307 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.finished-cross-group-sharing.hlsl @@ -0,0 +1,32 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// FinishedCrossGroupSharing() is called with RWDispatchNodeInputRecord + +RWBuffer buf0; + +struct [NodeTrackRWInputSharing] INPUT_RECORD +{ + uint value; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(256,1,1)] +[NumThreads(1,1,1)] +void node037_finishedcrossgroupsharing(RWDispatchNodeInputRecord input) +{ + bool b = input.FinishedCrossGroupSharing(); + buf0[0] = 0 ? 
b : 1; +} + +// CHECK: OpName [[INPUT:%[^ ]*]] "input" +// CHECK: OpDecorate [[STRUCT:%[^ ]*]] TrackFinishWritingAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[STRUCT]] = OpTypeStruct [[UINT]] +// CHECK: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]] +// CHECK: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]] +// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool +// CHECK: [[INPUT]] = OpFunctionParameter [[PTR]] +// CHECK: OpFinishWritingNodePayloadAMDX [[BOOL]] [[INPUT]] diff --git a/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl new file mode 100644 index 0000000000..a3af668c46 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.get-input-record-count.hlsl @@ -0,0 +1,25 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// GetInputRecordCount() called with NodeInputRecordArray + +RWBuffer buf0; + +struct INPUT_RECORD +{ + uint textureIndex; +}; + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(1024,1,1)] +[NodeIsProgramEntry] +void node014_getinputrecordcount([MaxRecords(256)] GroupNodeInputRecords inputs) +{ + uint numRecords = inputs.Count(); + buf0[0] = numRecords; +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: OpNodePayloadArrayLengthAMDX [[UINT]] diff --git a/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl new file mode 100644 index 0000000000..d029bd20bb --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.get-node-output-record.multiple.hlsl @@ -0,0 +1,72 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Multiple calls to Get*NodeOuputRecords(array) + +struct RECORD { + int i; + float3 foo; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(64, 1, 1)] +[NodeDispatchGrid(8, 1, 1)] +void node150_a(NodeOutput output) +{ + GroupNodeOutputRecords outRec1 = output.GetGroupNodeOutputRecords(1); + GroupNodeOutputRecords outRec2 = output.GetGroupNodeOutputRecords(4); + outRec1.OutputComplete(); + outRec2.OutputComplete(); +} + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(64, 1, 1)] +[NodeDispatchGrid(8, 1, 1)] +void node150_b(NodeOutput output) +{ + ThreadNodeOutputRecords outRec1 = output.GetThreadNodeOutputRecords(5); + ThreadNodeOutputRecords outRec2 = output.GetThreadNodeOutputRecords(1); + outRec1.OutputComplete(); + outRec1 = outRec2; + outRec1.OutputComplete(); +} + +// CHECK: OpDecorateId [[ARR_A:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]] +// CHECK: OpDecorateId [[ARR_B:%[^ ]*]] PayloadNodeNameAMDX [[STR]] + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5 +// CHECK-DAG: [[STR]] = OpConstantStringAMDX "output" +// CHECK-DAG: [[ARR_A]] = OpTypeNodePayloadArrayAMDX +// CHECK-DAG: [[ARR_B]] = OpTypeNodePayloadArrayAMDX +// CHECK-DAG: [[FPTR_A:%[^ ]*]] = OpTypePointer Function [[ARR_A]] +// CHECK-DAG: [[NPTR_A:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR_A]] +// CHECK-DAG: [[FPTR_B:%[^ ]*]] = OpTypePointer Function [[ARR_B]] +// CHECK-DAG: [[NPTR_B:%[^ ]*]] = OpTypePointer 
NodePayloadAMDX [[ARR_B]] + +// checking for OpFunctionCall skips over the entry function wrapper and +// thereby avoids matching wrapper variables +// CHECK: OpFunctionCall +// CHECK: [[OUT1:%[^ ]*]] = OpVariable [[FPTR_A]] +// CHECK: [[OUT2:%[^ ]*]] = OpVariable [[FPTR_A]] +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_A]] [[U2]] [[U1]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_A]] [[PAY]] +// CHECK: OpStore [[OUT1]] [[VAL]] +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_A]] [[U2]] [[U4]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_A]] [[PAY]] +// CHECK: OpStore [[OUT2]] [[VAL]] +// CHECK: OpFunctionCall +// CHECK: [[OUT1:%[^ ]*]] = OpVariable [[FPTR_B]] +// CHECK: [[OUT2:%[^ ]*]] = OpVariable [[FPTR_B]] +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_B]] [[U4]] [[U5]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_B]] [[PAY]] +// CHECK: OpStore [[OUT1]] [[VAL]] +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[NPTR_B]] [[U4]] [[U1]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[ARR_B]] [[PAY]] +// CHECK: OpStore [[OUT2]] [[VAL]] +// CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl b/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl new file mode 100644 index 0000000000..f981282748 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.get-remaining-recursion-levels.hlsl @@ -0,0 +1,26 @@ +// RUN: %dxc -spirv -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// GetRemainingRecursionLevels() called + +RWBuffer buf0; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(8,1,1)] +[NodeDispatchGrid(32,2,2)] +[NodeMaxRecursionDepth(16)] +void node133_getremainingrecursionlevels() +{ + uint remaining = GetRemainingRecursionLevels(); + // Use resource as a way of preventing DCE + buf0[0] = remaining; +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[^ ]*]] "node133_getremainingrecursionlevels" [[RRL:%[^ ]*]] +// CHECK: OpExecutionModeId [[SHADER]] MaxNodeRecursionAMDX [[U16:%[^ ]*]] +// CHECK: OpDecorate [[RRL]] BuiltIn RemainingRecursionLevelsAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U16]] = OpConstant [[UINT]] 16 +// CHECK: [[PTR:%[^ ]*]] = OpTypePointer Input [[UINT]] +// CHECK: [[RRL]] = OpVariable [[PTR]] Input +// CHECK: OpLoad [[UINT]] [[RRL]] diff --git a/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl b/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl new file mode 100644 index 0000000000..cf1638d75c --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.group-shared.barrier.hlsl @@ -0,0 +1,18 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Check that a barrier can be used on a groupshared object from a +// work graph node + +groupshared uint Test; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void firstNode() +{ + Test = 1; + AllMemoryBarrierWithGroupSync(); +} + +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl b/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl new file mode 100644 index 0000000000..81fc0e39a2 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.group-shared.hlsl @@ -0,0 +1,24 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Check that group shared memory is allowed from a work graph 
node + +struct Record +{ + uint index; +}; + +groupshared uint testLds[512]; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(2, 1, 1)] +[NumThreads(1,1,1)] +void firstNode(DispatchNodeInputRecord inputData) +{ + testLds[inputData.Get().index] = 99; +} + +// CHECK: OpReturn + diff --git a/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl new file mode 100644 index 0000000000..d6a2ea759e --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.group.hlsl @@ -0,0 +1,22 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Node with EmptyNodeOutput calls GroupIncrementOutputCount + + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(1024,1,1)] +[NodeIsProgramEntry] +void node028_incrementoutputcount([MaxRecords(32)] EmptyNodeOutput empty) +{ + empty.GroupIncrementOutputCount(1); +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[STRUCT:%[^ ]*]] = OpTypeStruct +// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]] +// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]] +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK: OpAllocateNodePayloadsAMDX [[PTR]] [[U2]] [[U1]] [[U0]] diff --git a/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl new file mode 100644 index 0000000000..6cd984fe69 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.increment-output-count.thread.hlsl @@ -0,0 +1,22 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 external -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Node with EmptyNodeOutput calls ThreadIncrementOutputCount + + +[Shader("node")] +[NodeLaunch("thread")] +[NodeIsProgramEntry] +void node028_incrementoutputcount([MaxRecords(32)] EmptyNodeOutput empty) +{ + empty.ThreadIncrementOutputCount(1); +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[STRUCT:%[^ ]*]] = OpTypeStruct +// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[STRUCT]] +// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]] +// CHECK-DAG: OpConstantStringAMDX "empty" +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK: OpAllocateNodePayloadsAMDX [[PTR]] [[U4]] [[U1]] [[U0]] diff --git a/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl new file mode 100644 index 0000000000..bae3f759b8 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.array.hlsl @@ -0,0 +1,26 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Check that SV_DispatchGrid supports array + +struct RECORD +{ + uint a[3] : SV_DispatchGrid; + uint b[3]; +}; + +[Shader("node")] +[NodeLaunch("coalescing")] +[numthreads(4,4,4)] +void node01(RWGroupNodeInputRecords input) +{ + input.Get().a = input.Get().b; +} + +// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD" +// CHECK: OpMemberDecorate [[RECORD]] 0 PayloadDispatchIndirectAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U3:%[^ ]*]] = 
OpConstant %uint 3 +// CHECK: [[ARRAY:%[^ ]*]] = OpTypeArray [[UINT]] [[U3]] +// CHECK: [[RECORD]] = OpTypeStruct [[ARRAY]] [[ARRAY]] diff --git a/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl new file mode 100644 index 0000000000..aee7e0d014 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.input-record.dispatch-grid.nested.hlsl @@ -0,0 +1,32 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Check that SV_DispatchGrid in nested struct is recognized + +struct INNER { + uint c; + uint3 grid : SV_DispatchGrid; +}; + +struct RECORD +{ + uint a; + INNER b; +}; + +[Shader("node")] +[NodeLaunch("coalescing")] +[numthreads(4,4,4)] +void node01(RWGroupNodeInputRecords input) +{ + input.Get().a = input.Get().b.grid.x; +} + +// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD" +// CHECK: OpName [[INNER:%[^ ]*]] "INNER" +// CHECK: OpMemberDecorate [[INNER]] 1 PayloadDispatchIndirectAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[VECTOR:%[^ ]*]] = OpTypeVector %uint 3 +// CHECK: [[INNER]] = OpTypeStruct [[UINT]] [[VECTOR]] +// CHECK: [[RECORD]] = OpTypeStruct [[UINT]] [[INNER]] diff --git a/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl b/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl new file mode 100644 index 0000000000..e2440a31c0 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.max-dispatch-grid.hlsl @@ -0,0 +1,30 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Broadcasting launch node with dispatch grid defined in input +// and max dispatch grid defined in the shader + +struct INPUT_GRID +{ + uint3 DispatchGrid : SV_DispatchGrid; + uint textureIndex; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeMaxDispatchGrid(2,3,4)] +[NumThreads(1024,1,1)] +void node002_dispatchgrid_input_maxdispatchgrid_shader(DispatchNodeInputRecord input) +{ +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[^ ]*]] "node002_dispatchgrid_input_maxdispatchgrid_shader" +// CHECK-DAG: OpExecutionMode [[SHADER]] LocalSize 1024 1 1 +// CHECK-DAG: OpExecutionModeId [[SHADER]] MaxNumWorkgroupsAMDX [[U2:%[^ ]*]] [[U3:%[^ ]*]] [[U4:%[0-9A-Za-z_]*]] +// CHECK: OpMemberDecorate %{{[^ ]*}} 0 PayloadDispatchIndirectAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U3]] = OpConstant [[UINT]] 3 +// CHECK-DAG: [[U4]] = OpConstant [[UINT]] 4 +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl b/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl new file mode 100644 index 0000000000..7d8449afab --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.max-records.hlsl @@ -0,0 +1,45 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Test referencing params with MaxOutputRecordsSharedWith + +struct rec0 +{ + int i0; + float f0; +}; + +struct rec1 +{ + float f1; + int i1; +}; + +[Shader("node")] +[NodeLaunch("thread")] +void BackwardRef( + RWThreadNodeInputRecord InputyMcInputFace, + [MaxRecords(5)] NodeOutput Output1, + [MaxRecordsSharedWith(Output1)] NodeOutput Output2) +{ +} + +// CHECK: OpDecorateId [[TYPE1:%[^ ]*]] PayloadNodeNameAMDX [[STR1:%[^ ]*]] +// CHECK: OpDecorateId 
[[TYPE1]] NodeMaxPayloadsAMDX [[U5:%[^ ]*]] +// CHECK: OpDecorateId [[TYPE2:%[^ ]*]] PayloadNodeNameAMDX [[STR2:%[^ ]*]] +// CHECK: OpDecorateId [[TYPE2]] NodeSharesPayloadLimitsWithAMDX [[TYPE1]] +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U5]] = OpConstant [[UINT]] 5 +// CHECK-DAG: [[STR1]] = OpConstantStringAMDX "Output1" +// CHECK-DAG: [[STR2]] = OpConstantStringAMDX "Output2" + +#if 0 +// copied from DXIL test but doesn't seem to conform to spec +[Shader("node")] +[NodeLaunch("thread")] +void ForwardRef( + RWThreadNodeInputRecord InputyMcInputFace, + [MaxRecordsSharedWith(Output2)] NodeOutput Output1, + [MaxRecords(5)] NodeOutput Output2) +{ +} +#endif diff --git a/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl new file mode 100644 index 0000000000..ac2474b29b --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.member.read.hlsl @@ -0,0 +1,150 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Read access to members of node input/output records + +RWBuffer buf0; + +struct RECORD +{ + uint a; + uint b; + uint c; +}; + +// CHECK: OpName [[BUF0:%[^ ]*]] "buf0" +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK: [[U16:%[^ ]*]] = OpConstant [[UINT]] 16 +// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1 +// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1 +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[S2:%[^ ]*]] = OpConstant [[INT]] 2 +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7 +// CHECK-DAG: [[TBI:%[^ ]*]] = OpTypeImage [[UINT]] Buffer + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(16,1,1)] +void node01(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().a; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]] +// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(16,1,1)] +void node02(RWDispatchNodeInputRecord input) +{ + buf0[0] = input.Get().b; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S1]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]] +// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024, 1, 1)] +[NodeLaunch("coalescing")] +void node03([MaxRecords(3)] GroupNodeInputRecords input) +{ + buf0[0] = input[1].c; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U1]] [[S2]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]] +// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1,1,1)] +[NodeLaunch("coalescing")] +void node04([MaxRecords(4)] RWGroupNodeInputRecords input) +{ + buf0[0] = input[2].c; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U2]] [[S2]] +// CHECK: [[VAL:%[^ ]*]] = 
OpLoad [[UINT]] [[PTR]] +// CHECK: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1,1,1)] +[NodeLaunch("coalescing")] +void node05(NodeOutput output) +{ + ThreadNodeOutputRecords outrec = output.GetThreadNodeOutputRecords(1); + buf0[0] = outrec.Get().a; +} + +// CHECK: OpFunction +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]] +// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] +// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]] +// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]] +// CHECK: [[PTR2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR1]] [[S0]] +// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR2]] +// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1,1,1)] +[NodeLaunch("coalescing")] +void node06(NodeOutput output) +{ + ThreadNodeOutputRecords outrec = output.GetThreadNodeOutputRecords(7); + buf0[0] = outrec[2].b; +} + +// CHECK: OpFunction +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U7]] [[U0]] +// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] +// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]] +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U2]] [[S1]] +// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR]] +// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1,1,1)] +[NodeLaunch("coalescing")] +void node07(NodeOutput output) +{ + GroupNodeOutputRecords outrec = output.GetGroupNodeOutputRecords(1); + buf0[0] = outrec.Get().c; +} + +// CHECK: OpFunction +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U1]] [[U0]] +// CHECK: [[TEMP:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] +// CHECK: OpStore [[OUT:%[^ ]*]] [[TEMP]] +// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]] +// CHECK: [[PTR2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR1]] [[S2]] +// CHECK-DAG: [[VAL:%[^ ]*]] = OpLoad [[UINT]] [[PTR2]] +// CHECK-DAG: [[IMG:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[IMG]] [[U0]] [[VAL]] +// CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl new file mode 100644 index 0000000000..5f7d434bd2 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.member.read.types.hlsl @@ -0,0 +1,193 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Read access of members of input/output record with different type +// sizes - we check the function specializations generated + +RWBuffer buf0; + +struct RECORD +{ + half h; + float f; + double d; + bool b; + uint16_t i16; + int i; + int64_t i64; + uint64_t u64; +}; + +// CHECK: OpName [[BUF0:%[^ ]*]] "buf0" +// CHECK-DAG: OpName [[RECORD:%[^ ]*]] "RECORD" +// CHECK-DAG: OpMemberName [[RECORD]] 0 "h" +// CHECK-DAG: OpMemberName [[RECORD]] 1 "f" +// CHECK-DAG: OpMemberName [[RECORD]] 2 "d" +// CHECK-DAG: OpMemberName [[RECORD]] 3 "b" +// CHECK-DAG: OpMemberName [[RECORD]] 4 "i16" +// CHECK-DAG: OpMemberName [[RECORD]] 5 "i" +// CHECK-DAG: OpMemberName [[RECORD]] 6 "i64" +// CHECK-DAG: OpMemberName [[RECORD]] 7 "u64" + +// CHECK-DAG: [[UINT:%[^ ]*]] = 
OpTypeInt 32 0 +// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1 +// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0 +// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1 +// CHECK-DAG: [[S2:%[^ ]*]] = OpConstant [[INT]] 2 +// CHECK-DAG: [[S3:%[^ ]*]] = OpConstant [[INT]] 3 +// CHECK-DAG: [[S4:%[^ ]*]] = OpConstant [[INT]] 4 +// CHECK-DAG: [[S5:%[^ ]*]] = OpConstant [[INT]] 5 +// CHECK-DAG: [[S6:%[^ ]*]] = OpConstant [[INT]] 6 +// CHECK-DAG: [[S7:%[^ ]*]] = OpConstant [[INT]] 7 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[TBI:%[^ ]*]] = OpTypeImage [[UINT]] Buffer + +// CHECK-DAG: [[HALF:%[^ ]*]] = OpTypeFloat 16 +// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32 +// CHECK-DAG: [[DOUBLE:%[^ ]*]] = OpTypeFloat 64 +// CHECK-DAG: [[USHORT:%[^ ]*]] = OpTypeInt 16 0 +// CHECK-DAG: [[LONG:%[^ ]*]] = OpTypeInt 64 1 +// CHECK-DAG: [[ULONG:%[^ ]*]] = OpTypeInt 64 0 +// CHECK: [[RECORD]] = OpTypeStruct [[HALF]] [[FLOAT]] [[DOUBLE]] [[UINT]] [[USHORT]] [[INT]] [[LONG]] [[ULONG]] +// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node01(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().h; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[HALF]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node02(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().f; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S1]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[FLOAT]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node03(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().d; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S2]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[DOUBLE]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpConvertFToU [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node04(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().b; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S3]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[UINT]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpINotEqual [[BOOL]] [[VAL0]] [[U0]] +// CHECK: [[VAL2:%[^ ]*]] = OpSelect [[UINT]] [[VAL1]] [[U1]] [[U0]] +// CHECK: [[VAL3:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL3]] [[U0]] [[VAL2]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node05(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().i16; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S4]] +// CHECK: [[VAL0:%[^ ]*]] 
= OpLoad [[USHORT]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpUConvert [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node06(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().i; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S5]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[INT]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpBitcast [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node07(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().i64; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S6]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[LONG]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpSConvert [[INT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpBitcast [[UINT]] [[VAL1]] +// CHECK: [[VAL3:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL3]] [[U0]] [[VAL2]] None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node08(DispatchNodeInputRecord input) +{ + buf0[0] = input.Get().u64; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S7]] +// CHECK: [[VAL0:%[^ ]*]] = OpLoad [[ULONG]] [[PTR]] +// CHECK: [[VAL1:%[^ ]*]] = OpUConvert [[UINT]] [[VAL0]] +// CHECK: [[VAL2:%[^ ]*]] = OpLoad [[TBI]] [[BUF0]] +// CHECK: OpImageWrite [[VAL2]] [[U0]] [[VAL1]] None +// CHECK: OpFunctionEnd + diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl new file mode 100644 index 0000000000..33fc2dd9ff --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.member.write.hlsl @@ -0,0 +1,88 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments + +// Writes to members of the various read-write node records + +struct RECORD +{ + uint a; + uint b; +}; + +// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[S0:%[^ ]*]] = OpConstant [[INT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[S1:%[^ ]*]] = OpConstant [[INT]] 1 +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK-DAG: [[U5:%[^ ]*]] = OpConstant [[UINT]] 5 +// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7 +// CHECK-DAG: [[U8:%[^ ]*]] = OpConstant [[UINT]] 8 +// CHECK-DAG: [[U9:%[^ ]*]] = OpConstant [[UINT]] 9 +// CHECK-DAG: [[U11:%[^ ]*]] = OpConstant [[UINT]] 11 + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node01(RWDispatchNodeInputRecord input1) +{ + input1.Get().a = 5; +} + +// CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[S0]] +// CHECK: OpStore [[PTR]] [[U5]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(2,1,1)] +[NodeLaunch("coalescing")] +void node02([MaxRecords(4)] RWGroupNodeInputRecords input2) +{ + input2[1].b = 7; +} + +// 
CHECK: OpFunction +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} %{{[^ ]*}} [[U1]] [[S1]] +// CHECK: OpStore [[PTR]] [[U7]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(3,1,1)] +[NodeLaunch("coalescing")] +void node03(NodeOutput output) +{ + ThreadNodeOutputRecords output3 = output.GetThreadNodeOutputRecords(2); + output3.Get().b = 9; +} + +// CHECK: OpFunction +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U2]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] +// CHECK: OpStore [[OUT:%[^ ]*]] [[VAL]] +// CHECK: [[PTR0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]] +// CHECK: [[PTR1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[PTR0]] [[S1]] +// CHECK: OpStore [[PTR1]] [[U9]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NumThreads(4,1,1)] +[NodeLaunch("coalescing")] +void node04(NodeOutput output) +{ + GroupNodeOutputRecords output4 = output.GetGroupNodeOutputRecords(8); + output4[0].a = 11; +} + +// CHECK: OpFunction +// CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U8]] [[U0]] +// CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] +// CHECK: OpStore [[OUT:%[^ ]*]] [[VAL]] +// CHECK: [[PTR:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUT]] [[U0]] [[S0]] +// CHECK: OpStore [[PTR]] [[U11]] +// CHECK: OpFunctionEnd + diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl new file mode 100644 index 0000000000..d875f27d4e --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.member.write.matrix.hlsl @@ -0,0 +1,123 @@ +// RUN: %dxc -spirv -Vd -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// Note: validation disabled until NodePayloadAMDX pointers are allowed +// as function arguments +// ================================================================== +// Test writing to matrix members of node records +// ================================================================== + +// CHECK: OpName [[NODE01:%[^ ]*]] "node01" +// CHECK: OpName [[INPUT1:%[^ ]*]] "input1" +// CHECK: OpName [[NODE02:%[^ ]*]] "node02" +// CHECK: OpName [[INPUT2:%[^ ]*]] "input2" +// CHECK: OpName [[NODE03:%[^ ]*]] "node03" +// CHECK: OpName [[OUTPUT3:%[^ ]*]] "output3" +// CHECK: OpName [[NODE04:%[^ ]*]] "node04" +// CHECK: OpName [[OUTPUTS4:%[^ ]*]] "outputs4" + +struct RECORD +{ + row_major float2x2 m0; + row_major float2x2 m1; + column_major float2x2 m2; +}; + +// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U64:%[^ ]*]] = OpConstant [[UINT]] 64 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32 +// CHECK-DAG: [[F111:%[^ ]*]] = OpConstant [[FLOAT]] 111 +// CHECK-DAG: [[V2FLOAT:%[^ ]*]] = OpTypeVector [[FLOAT]] 2 +// CHECK-DAG: [[C1:%[^ ]*]] = OpConstantComposite [[V2FLOAT]] [[F111]] [[F111]] +// CHECK-DAG: [[MAT2V2FLOAT:[^ ]*]] = OpTypeMatrix [[V2FLOAT]] 2 +// CHECK-DAG: [[M1:%[^ ]*]] = OpConstantComposite [[MAT2V2FLOAT]] [[C1]] [[C1]] +// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1 +// CHECK-DAG: [[I1:%[^ ]*]] = OpConstant [[INT]] 1 +// CHECK-DAG: [[I0:%[^ ]*]] = OpConstant [[INT]] 0 +// CHECK-DAG: [[I2:%[^ ]*]] = OpConstant [[INT]] 2 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[F222:%[^ ]*]] = OpConstant [[FLOAT]] 222 +// CHECK-DAG: [[C2:%[^ ]*]] = OpConstantComposite [[V2FLOAT]] [[F222]] [[F222]] +// CHECK-DAG: [[M2:%[^ ]*]] = OpConstantComposite [[MAT2V2FLOAT]] [[C2]] [[C2]] +// CHECK-DAG: [[U4:%[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK-DAG: [[U2:%[^ ]*]] = 
OpConstant [[UINT]] 2 + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(64,1,1)] +void node01(RWDispatchNodeInputRecord input1) +{ + // CHECK: [[NODE01]] = OpFunction + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]] + // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]] + // CHECK: OpStore [[P1]] [[M1]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]] + // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I0]] + // CHECK: [[VAL:%[^ ]*]] = OpLoad [[MAT2V2FLOAT]] [[P2]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT1]] [[U0]] + // CHECK: [[P3:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]] + // CHECK: OpStore [[P3]] [[VAL]] + // CHECK: OpFunctionEnd + input1.Get().m1 = 111; + input1.Get().m2 = input1.Get().m0; +} + +[Shader("node")] +[NumThreads(1,1,1)] +[NodeLaunch("coalescing")] +void node02([MaxRecords(4)] RWGroupNodeInputRecords input2) +{ + // CHECK: [[NODE02]] = OpFunction + // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U0]] [[I1]] + // CHECK: OpStore [[P1]] [[M1]] + // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U1]] [[I0]] + // CHECK: [[VAL:%[^ ]*]] = OpLoad [[MAT2V2FLOAT]] [[P2]] + // CHECK: [[P3:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[INPUT2]] [[U1]] [[I2]] + // CHECK: OpStore [[P3]] [[VAL]] + // CHECK: OpFunctionEnd + input2[0].m1 = 111; + input2[1].m2 = input2[1].m0; +} + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeDispatchGrid(64,1,1)] +[NodeLaunch("broadcasting")] +void node03(NodeOutput output3) +{ + // CHECK: [[NODE03]] = OpFunction + // CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]] + // CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] + // CHECK: OpStore [[OUTREC3:%[^ ]*]] [[VAL]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC3]] [[U0]] + // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]] + // CHECK: OpStore [[P1]] [[M1]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC3]] [[U0]] + // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]] + // CHECK: OpStore [[P2]] [[M2]] + // CHECK: OpFunctionEnd + ThreadNodeOutputRecords outrec = output3.GetThreadNodeOutputRecords(1); + outrec.Get().m1 = 111; + outrec.Get().m2 = 222; +} + +[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("coalescing")] +void node04([MaxRecords(5)] NodeOutput outputs4) +{ + // CHECK: [[NODE04]] = OpFunction + // CHECK: [[PAY:%[^ ]*]] = OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U2]] [[U1]] [[U0]] + // CHECK: [[VAL:%[^ ]*]] = OpLoad %{{[^ ]*}} [[PAY]] + // CHECK: OpStore [[OUTREC4:%[^ ]*]] [[VAL]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC4]] [[U0]] + // CHECK: [[P1:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I1]] + // CHECK: OpStore [[P1]] [[M1]] + // CHECK: [[P0:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[OUTREC4]] [[U0]] + // CHECK: [[P2:%[^ ]*]] = OpAccessChain %{{[^ ]*}} [[P0]] [[I2]] + // CHECK: OpStore [[P2]] [[M2]] + // CHECK: OpFunctionEnd + GroupNodeOutputRecords outrec = outputs4.GetGroupNodeOutputRecords(1); + outrec.Get().m1 = 111; + outrec.Get().m2 = 222; +} diff --git a/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl b/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl new file mode 100644 index 0000000000..ec95c3d758 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.member.write.types.hlsl @@ -0,0 +1,150 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 -enable-16bit-types %s | FileCheck %s + +// 
Writes to node record members of various types + + +struct RECORD +{ + half h; + float f; + double d; + bool b; + int16_t i16; + uint16_t u16; + int i; + int64_t i64; + uint64_t u64; + float3 f3; + int ia[7]; +}; + +// CHECK: OpName [[RECORD:%[^ ]*]] "RECORD" +// CHECK: OpMemberName [[RECORD]] 0 "h" +// CHECK: OpMemberName [[RECORD]] 1 "f" +// CHECK: OpMemberName [[RECORD]] 2 "d" +// CHECK: OpMemberName [[RECORD]] 3 "b" +// CHECK: OpMemberName [[RECORD]] 4 "i16" +// CHECK: OpMemberName [[RECORD]] 5 "u16" +// CHECK: OpMemberName [[RECORD]] 6 "i" +// CHECK: OpMemberName [[RECORD]] 7 "i64" +// CHECK: OpMemberName [[RECORD]] 8 "u64" +// CHECK: OpMemberName [[RECORD]] 9 "f3" +// CHECK: OpMemberName [[RECORD]] 10 "ia" + +// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[HALF:%[^ ]*]] = OpTypeFloat 16 +// CHECK-DAG: [[INT:%[^ ]*]] = OpTypeInt 32 1 +// CHECK-DAG: [[FLOAT:%[^ ]*]] = OpTypeFloat 32 +// CHECK-DAG: [[DOUBLE:%[^ ]*]] = OpTypeFloat 64 +// CHECK-DAG: [[SHORT:%[^ ]*]] = OpTypeInt 16 1 +// CHECK-DAG: [[USHORT:%[^ ]*]] = OpTypeInt 16 0 +// CHECK-DAG: [[LONG:%[^ ]*]] = OpTypeInt 64 1 +// CHECK-DAG: [[ULONG:%[^ ]*]] = OpTypeInt 64 0 +// CHECK-DAG: [[V3FLOAT:%[^ ]*]] = OpTypeVector [[FLOAT]] 3 + +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[HALF_0X1_8P_1:%[^ ]*]] = OpConstant [[HALF]] 0x1.8p+1 +// CHECK-DAG: [[I0:%[^ ]*]] = OpConstant [[INT]] 0 +// CHECK-DAG: [[FN5:%[^ ]*]] = OpConstant [[FLOAT]] -5 +// CHECK-DAG: [[I1:%[^ ]*]] = OpConstant [[INT]] 1 +// CHECK-DAG: [[D7:%[^ ]*]] = OpConstant [[DOUBLE]] 7 +// CHECK-DAG: [[I2:%[^ ]*]] = OpConstant [[INT]] 2 +// CHECK-DAG: [[I3:%[^ ]*]] = OpConstant [[INT]] 3 +// CHECK-DAG: [[S11:%[^ ]*]] = OpConstant [[SHORT]] 11 +// CHECK-DAG: [[I4:%[^ ]*]] = OpConstant [[INT]] 4 +// CHECK-DAG: [[US13:%[^ ]*]] = OpConstant [[USHORT]] 13 +// CHECK-DAG: [[I5:%[^ ]*]] = OpConstant [[INT]] 5 +// CHECK-DAG: [[I17:%[^ ]*]] = OpConstant [[INT]] 17 +// CHECK-DAG: [[I6:%[^ ]*]] = OpConstant [[INT]] 6 +// CHECK-DAG: [[LN19:%[^ ]*]] = OpConstant [[LONG]] -19 +// CHECK-DAG: [[I7:%[^ ]*]] = OpConstant [[INT]] 7 +// CHECK-DAG: [[UL21:%[^ ]*]] = OpConstant [[ULONG]] 21 +// CHECK-DAG: [[I8:%[^ ]*]] = OpConstant [[INT]] 8 +// CHECK-DAG: [[F23:%[^ ]*]] = OpConstant [[FLOAT]] 23 +// CHECK-DAG: [[I9:%[^ ]*]] = OpConstant [[INT]] 9 +// CHECK-DAG: [[I29:%[^ ]*]] = OpConstant [[INT]] 29 +// CHECK-DAG: [[I10:%[^ ]*]] = OpConstant [[INT]] 10 +// CHECK-DAG: [[U7:%[^ ]*]] = OpConstant [[UINT]] 7 + +// CHECK-DAG: [[AI7:%[^ ]*]] = OpTypeArray [[INT]] [[U7]] +// CHECK-DAG: [[RECORD]] = OpTypeStruct [[HALF]] [[FLOAT]] [[DOUBLE]] [[UINT]] [[SHORT]] [[USHORT]] [[INT]] [[LONG]] [[ULONG]] [[V3FLOAT]] [[AI7]] +// CHECK-DAG: [[RAR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX %RECORD +// CHECK-DAG: [[RARP:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[RAR]] +// CHECK-DAG: [[U2:%[^ ]*]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[HALFP:%[^ ]*]] = OpTypePointer Function [[HALF]] +// CHECK-DAG: [[FLOATP:%[^ ]*]] = OpTypePointer Function [[FLOAT]] +// CHECK-DAG: [[DOUBLEP:%[^ ]*]] = OpTypePointer Function [[DOUBLE]] +// CHECK-DAG: [[UINTP:%[^ ]*]] = OpTypePointer Function [[UINT]] +// CHECK-DAG: [[SHORTP:%[^ ]*]] = OpTypePointer Function [[SHORT]] +// CHECK-DAG: [[USHORTP:%[^ ]*]] = OpTypePointer Function [[USHORT]] +// CHECK-DAG: [[INTP:%[^ ]*]] = OpTypePointer Function [[INT]] +// CHECK-DAG: [[LONGP:%[^ ]*]] = OpTypePointer Function [[LONG]] +// CHECK-DAG: [[ULONGP:%[^ ]*]] = OpTypePointer Function [[ULONG]] + 
+[Shader("node")] +[NumThreads(1024,1,1)] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(512,1,1)] +void node125(NodeOutput output) +{ + GroupNodeOutputRecords output01 = output.GetGroupNodeOutputRecords(1); + // CHECK: OpAllocateNodePayloadsAMDX [[RARP]] [[U2]] [[U1]] [[U0]] + + output01.Get().h = 3.0; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[HALFP]] + // CHECK-SAME: [[I0]] + // CHECK: OpStore [[PTR]] [[HALF_0X1_8P_1]] + + output01.Get().f = -5.0; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[FLOATP]] + // CHECK-SAME: [[I1]] + // CHECK: OpStore [[PTR]] [[FN5]] + + output01.Get().d = 7.0; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[DOUBLEP]] + // CHECK-SAME: [[I2]] + // CHECK: OpStore [[PTR]] [[D7]] + + output01.Get().b = true; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[UINTP]] + // CHECK-SAME: [[I3]] + // CHECK: OpStore [[PTR]] [[U1]] + + output01.Get().i16 = 11; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[SHORTP]] + // CHECK-SAME: [[I4]] + // CHECK: OpStore [[PTR]] [[S11]] + + output01.Get().u16 = 13; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[USHORTP]] + // CHECK-SAME: [[I5]] + // CHECK: OpStore [[PTR]] [[US13]] + + output01.Get().i = 17; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[INTP]] + // CHECK-SAME: [[I6]] + // CHECK: OpStore [[PTR]] [[I17]] + + output01.Get().i64 = -19; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[LONGP]] + // CHECK-SAME: [[I7]] + // CHECK: OpStore [[PTR]] [[LN19]] + + output01.Get().u64 = 21; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[ULONGP]] + // CHECK-SAME: [[I8]] + // CHECK: OpStore [[PTR]] [[UL21]] + + output01.Get().f3.y = 23; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[FLOATP]] + // CHECK-SAME: [[I9]] + // CHECK-SAME: [[I1]] + // CHECK: OpStore [[PTR]] [[F23]] + + output01.Get().ia[5] = 29; + // CHECK: [[PTR:%[^ ]*]] = OpAccessChain [[INTP]] + // CHECK-SAME: [[I10]] + // CHECK-SAME: [[I5]] + // CHECK: OpStore [[PTR]] [[I29]] +} diff --git a/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl b/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl new file mode 100644 index 0000000000..4d1726abb2 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.mesh.hlsl @@ -0,0 +1,88 @@ +// RUN: %dxc -spirv -T lib_6_9 -fspv-target-env=vulkan1.3 %s | FileCheck %s +// XFAIL: * +// disabled until mesh nodes are implemented + +// Test loading of node input and funneling into mesh outputs +// Essentially an end-to-end mesh node test. 
+ + +RWBuffer buf0; + +#define MAX_VERT 32 +#define MAX_PRIM 16 + +struct MeshPerVertex { + float4 position : SV_Position; + float color[4] : COLOR; +}; + +struct MeshPerPrimitive { + float normal : NORMAL; + float malnor : MALNOR; + float alnorm : ALNORM; + float ormaln : ORMALN; + int layer[6] : LAYER; +}; + +struct MeshPayload { + float normal; + float malnor; + float alnorm; + float ormaln; + int layer[6]; +}; + +groupshared float gsMem[MAX_PRIM]; + +[Shader("node")] +[NodeLaunch("mesh")] +[outputtopology("triangle")] +[numthreads(128, 1, 1)] +[NodeDispatchGrid(64,1,1)] +void node_setmeshoutputcounts(DispatchNodeInputRecord mpl, + out indices uint3 primIndices[MAX_PRIM], + out vertices MeshPerVertex verts[MAX_VERT], + out primitives MeshPerPrimitive prims[MAX_PRIM], + in uint tig : SV_GroupIndex) { + SetMeshOutputCounts(32, 16); + + // create mpl + + MeshPerVertex ov; + ov.position = float4(14.0,15.0,16.0,17.0); + ov.color[0] = 14.0; + ov.color[1] = 15.0; + ov.color[2] = 16.0; + ov.color[3] = 17.0; + + if (tig % 3) { + primIndices[tig / 3] = uint3(tig, tig + 1, tig + 2); + + MeshPerPrimitive op; + op.normal = mpl.Get().normal; + op.malnor = gsMem[tig / 3 + 1]; + op.alnorm = mpl.Get().alnorm; + op.ormaln = mpl.Get().ormaln; + op.layer[0] = mpl.Get().layer[0]; + op.layer[1] = mpl.Get().layer[1]; + op.layer[2] = mpl.Get().layer[2]; + op.layer[3] = mpl.Get().layer[3]; + op.layer[4] = mpl.Get().layer[4]; + op.layer[5] = mpl.Get().layer[5]; + + gsMem[tig / 3] = op.normal; + prims[tig / 3] = op; + } + verts[tig] = ov; +} + +// CHECK: OpEntryPoint MeshExt [[ENTRY:%[^ ]*]] +// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputVertices 32 +// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputPrimitivesNV 16 +// CHECK-DAG: OpExecutionMode [[ENTRY]] OutputTrianglesNV +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U16:%[^ ]*]] = OpConstant [[UINT]] 16 +// CHECK-DAG: [[U32:%[^ ]*]] = OpConstant [[UINT]] 32 +// CHECK: [[ENTRY]] = OpFunction +// CHECK: OpSetMeshOutputsEXT [[U32]] [[U16]] +// CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl b/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl new file mode 100644 index 0000000000..17db15e7db --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.output-complete.hlsl @@ -0,0 +1,33 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// OutputComplete() is called with NodeOutput + +struct OUTPUT_RECORD +{ + uint value; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(256,1,1)] +[NumThreads(1024,1,1)] +void outputcomplete([MaxRecords(256)] NodeOutput output) +{ + ThreadNodeOutputRecords outputrecords = output.GetThreadNodeOutputRecords(1); + // ... 
+ outputrecords.OutputComplete(); +} + +// CHECK: OpName [[RECORDS:%[^ ]*]] "outputrecords" +// CHECK: OpDecorateId [[ARR:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]] +// CHECK-DAG: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U1:%[^ ]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[REC:%[^ ]*]] = OpTypeStruct [[UINT]] +// CHECK-DAG: [[ARR:%[^ ]*]] = OpTypeNodePayloadArrayAMDX [[REC]] +// CHECK-DAG: [[PTR:%[^ ]*]] = OpTypePointer NodePayloadAMDX [[ARR]] +// CHECK-DAG: [[U4:[^ ]*]] = OpConstant [[UINT]] 4 +// CHECK: [[V0:%[^ ]*]] = OpAllocateNodePayloadsAMDX [[PTR]] [[U4]] [[U1]] [[U0]] +// CHECK: [[V1:%[^ ]*]] = OpLoad [[ARR]] [[V0]] +// CHECK: OpStore [[RECORDS]] [[V1]] +// CHECK: OpEnqueueNodePayloadsAMDX [[RECORDS]] diff --git a/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl new file mode 100644 index 0000000000..08a103cf5e --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.empty.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// NodeOutputIsValid() is called with EmptyNodeOutput + +RWBuffer buf0; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(256,1,1)] +[NumThreads(1,1,1)] +void node131_nodeoutputisvalid_emptynodeoutput(EmptyNodeOutput output) +{ + buf0[0] = output.IsValid(); +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool +// CHECK: OpIsNodePayloadValidAMDX [[BOOL]] %{{[^ ]*}} [[U0]] diff --git a/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl new file mode 100644 index 0000000000..40e3a74fcb --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.output.is-valid.hlsl @@ -0,0 +1,24 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// IsValid() is invoked on NodeOutput + +RWBuffer buf0; + +struct RECORD +{ + uint value; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(256,1,1)] +[NumThreads(1,1,1)] +void node129_nodeoutputisvalid_nodeoutput(NodeOutput output) +{ + buf0[0] = output.IsValid(); +} + +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK: [[BOOL:%[^ ]*]] = OpTypeBool +// CHECK: OpIsNodePayloadValidAMDX [[BOOL]] %{{[^ ]*}} [[U0]] diff --git a/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl new file mode 100644 index 0000000000..265fd6c17f --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.renamed.hlsl @@ -0,0 +1,28 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 external %s | FileCheck %s + +// Renamed node, unnamed index defaults to 0 + +struct RECORD { + uint i; +}; + +[Shader("node")] +[NodeLaunch("thread")] +[NodeID("new_node_name")] +[NodeIsProgramEntry] +void node017_renamed_node([NodeID("output_node_name", 2)] NodeOutput r) +{ + ThreadNodeOutputRecords records = r.GetThreadNodeOutputRecords(1); + records.OutputComplete(); +} + +// CHECK: OpEntryPoint GLCompute %{{[^ ]*}} "node017_renamed_node" +// CHECK-DAG: OpDecorateId [[TYPE:%[^ ]*]] PayloadNodeNameAMDX [[STR:%[0-9A-Za-z_]*]] +// CHECK-DAG: OpDecorateId [[TYPE]] PayloadNodeBaseIndexAMDX [[U2:%[0-9A-Za-z_]*]] +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[STR]] = OpConstantStringAMDX "output_node_name" +// CHECK-DAG: [[U0:%[_0-9A-Za-z]*]] = 
OpConstant [[UINT]] 0 +// CHECK-DAG: [[U1:%[_0-9A-Za-z]*]] = OpConstant [[UINT]] 1 +// CHECK-DAG: [[U2]] = OpConstant [[UINT]] 2 +// CHECK-DAG: [[U4:%[_0-9A-Za-z]*]] = OpConstant [[UINT]] 4 +// CHECK: OpAllocateNodePayloadsAMDX %{{[^ ]*}} [[U4]] [[U1]] [[U0]] diff --git a/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl b/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl new file mode 100644 index 0000000000..c439bef017 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.share-input.hlsl @@ -0,0 +1,42 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Check that the NodeShareInputOf metadata entry is populated correctly + +struct entryRecord +{ + int data0; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(2, 1, 1)] +[NumThreads(1, 1, 1)] +void firstNode(DispatchNodeInputRecord inputData) +{ } + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(2, 1, 1)] +[NumThreads(1, 1, 1)] +[NodeShareInputOf("firstNode")] +void secondNode(DispatchNodeInputRecord inputData) +{ } + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(2, 1, 1)] +[NumThreads(1, 1, 1)] +[NodeShareInputOf("firstNode", 3)] +void thirdNode(DispatchNodeInputRecord inputData) +{ } + + +// CHECK: OpEntryPoint GLCompute %firstNode "firstNode" +// CHECK: OpEntryPoint GLCompute %secondNode "secondNode" +// CHECK: OpEntryPoint GLCompute %thirdNode "thirdNode" +// CHECK-NOT: OpExecutionModeId %firstNode SharesInputWithAMDX +// CHECK: OpExecutionModeId %secondNode SharesInputWithAMDX [[STR:%[^ ]*]] [[U0:%[^ ]*]] +// CHECK: OpExecutionModeId %thirdNode SharesInputWithAMDX [[STR]] [[U3:%[^ ]*]] +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK-DAG: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK-DAG: [[U3:%[^ ]*]] = OpConstant [[UINT]] 3 diff --git a/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl b/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl new file mode 100644 index 0000000000..ca3c14b8da --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.sparse-nodes.hlsl @@ -0,0 +1,141 @@ +// RUN: %dxc -spirv -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +struct RECORD1 +{ + uint value; + uint value2; +}; + +// CHECK: OpEntryPoint GLCompute [[NODE10:%[^ ]*]] "node_1_0" +// CHECK: OpEntryPoint GLCompute [[NODE11:%[^ ]*]] "node_1_1" +// CHECK: OpEntryPoint GLCompute [[NODE12:%[^ ]*]] "node_1_2" +// CHECK: OpEntryPoint GLCompute [[NODE20:%[^ ]*]] "node_2_0" +// CHECK: OpEntryPoint GLCompute [[NODE21:%[^ ]*]] "node_2_1" +// CHECK: OpEntryPoint GLCompute [[NODE22:%[^ ]*]] "node_2_2" +// CHECK: OpDecorateId [[A10:%[^ ]*]] PayloadNodeNameAMDX [[S10:%[^ ]*]] +// CHECK: OpDecorateId [[A10]] NodeMaxPayloadsAMDX [[U31:%[^ ]*]] +// CHECK: OpDecorate [[A10]] PayloadNodeSparseArrayAMDX +// CHECK: OpDecorateId [[A10]] PayloadNodeArraySizeAMDX [[U129:%[^ ]*]] +// CHECK: OpDecorateId [[A11:%[^ ]*]] PayloadNodeNameAMDX [[S11:%[^ ]*]] +// CHECK: OpDecorateId [[A11]] NodeMaxPayloadsAMDX [[U37:%[^ ]*]] +// CHECK: OpDecorate [[A11]] PayloadNodeSparseArrayAMDX +// CHECK: OpDecorateId [[A12:%[^ ]*]] PayloadNodeNameAMDX [[S12:%[^ ]*]] +// CHECK: OpDecorateId [[A12]] NodeMaxPayloadsAMDX [[U47:%[^ ]*]] +// CHECK: OpDecorate [[A12]] PayloadNodeSparseArrayAMDX +// CHECK: OpDecorateId [[A20:%[^ ]*]] PayloadNodeNameAMDX [[S20:%[^ ]*]] +// CHECK: OpDecorateId [[A20]] NodeMaxPayloadsAMDX [[U41:%[^ ]*]] +// CHECK: OpDecorate [[A20]] PayloadNodeSparseArrayAMDX +// CHECK: OpDecorateId [[A20]] PayloadNodeArraySizeAMDX [[U131:%[^ ]*]] +// CHECK: OpDecorateId 
[[A21:%[^ ]*]] PayloadNodeNameAMDX [[S21:%[^ ]*]] +// CHECK: OpDecorateId [[A21]] NodeMaxPayloadsAMDX [[U43:%[^ ]*]] +// CHECK: OpDecorate [[A21]] PayloadNodeSparseArrayAMDX +// CHECK: OpDecorateId [[A22:%[^ ]*]] PayloadNodeNameAMDX [[S22:%[^ ]*]] +// CHECK: OpDecorateId [[A22]] NodeMaxPayloadsAMDX [[U53:%[^ ]*]] +// CHECK: OpDecorate [[A22]] PayloadNodeSparseArrayAMDX +// CHECK: [[UINT:%[^ ]*]] = OpTypeInt 32 0 +// CHECK: [[U0:%[^ ]*]] = OpConstant [[UINT]] 0 +// CHECK: [[RECORD:%[^ ]*]] = OpTypeStruct [[UINT]] [[UINT]] +// CHECK-DAG: [[A10]] = OpTypeNodePayloadArrayAMDX [[RECORD]] +// CHECK-DAG: [[S10]] = OpConstantStringAMDX "OutputArray_1_0" +// CHECK-DAG: [[U31]] = OpConstant [[UINT]] 31 +// CHECK-DAG: [[U129]] = OpConstant [[UINT]] 129 +// CHECK-DAG: [[A11]] = OpTypeNodePayloadArrayAMDX [[RECORD]] +// CHECK-DAG: [[S11]] = OpConstantStringAMDX "OutputArray_1_1" +// CHECK-DAG: [[U37]] = OpConstant [[UINT]] 37 +// CHECK-DAG: [[A12]] = OpTypeNodePayloadArrayAMDX [[RECORD]] +// CHECK-DAG: [[S12]] = OpConstantStringAMDX "Output_1_2" +// CHECK-DAG: [[U47]] = OpConstant [[UINT]] 47 +// CHECK-DAG: [[EMPTY:%[^ ]*]] = OpTypeStruct +// CHECK-DAG: [[A20]] = OpTypeNodePayloadArrayAMDX [[EMPTY]] +// CHECK-DAG: [[S20]] = OpConstantStringAMDX "OutputArray_2_0" +// CHECK-DAG: [[U41]] = OpConstant [[UINT]] 41 +// CHECK-DAG: [[U131]] = OpConstant [[UINT]] 131 +// CHECK-DAG: [[A21]] = OpTypeNodePayloadArrayAMDX [[EMPTY]] +// CHECK-DAG: [[S21]] = OpConstantStringAMDX "OutputArray_2_1" +// CHECK-DAG: [[U43]] = OpConstant [[UINT]] 43 +// CHECK-DAG: [[A22]] = OpTypeNodePayloadArrayAMDX [[EMPTY]] +// CHECK-DAG: [[S22]] = OpConstantStringAMDX "Output_2_2" +// CHECK-DAG: [[U53]] = OpConstant [[UINT]] 53 + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_1_0( + [AllowSparseNodes] [NodeArraySize(129)] [MaxRecords(31)] + NodeOutputArray OutputArray_1_0) { + ThreadNodeOutputRecords outRec = OutputArray_1_0[1].GetThreadNodeOutputRecords(2); + outRec.OutputComplete(); +} + +// CHECK: [[NODE10]] = OpFunction %void None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_1_1( + [UnboundedSparseNodes] [MaxRecords(37)] + NodeOutputArray OutputArray_1_1) { + ThreadNodeOutputRecords outRec = OutputArray_1_1[1].GetThreadNodeOutputRecords(2); + outRec.OutputComplete(); +} + +// CHECK: [[NODE11]] = OpFunction %void None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_1_2( + [AllowSparseNodes] [MaxRecords(47)] + NodeOutput Output_1_2) { + ThreadNodeOutputRecords outRec = Output_1_2.GetThreadNodeOutputRecords(2); + outRec.OutputComplete(); +} + +// CHECK: [[NODE12]] = OpFunction %void None +// CHECK: %{{[^ ]*}} = OpAllocateNodePayloadsAMDX %{{[^ ]*}} %{{[^ ]*}} %{{[^ ]*}} [[U0]] +// CHECK: OpFunctionEnd + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_2_0( + [AllowSparseNodes] [NodeArraySize(131)] [MaxRecords(41)] + EmptyNodeOutputArray OutputArray_2_0) { + OutputArray_2_0[1].GroupIncrementOutputCount(10); +} + +// CHECK: [[NODE20]] = OpFunction %void None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_2_1( + [UnboundedSparseNodes] [MaxRecords(43)] + EmptyNodeOutputArray OutputArray_2_1) { + OutputArray_2_1[1].GroupIncrementOutputCount(10); 
+} + +// CHECK: [[NODE21]] = OpFunction %void None +// CHECK: OpFunctionEnd + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeDispatchGrid(1, 1, 1)] +[NumThreads(1, 1, 1)] +void node_2_2( + [AllowSparseNodes] [MaxRecords(53)] + EmptyNodeOutput Output_2_2) { + Output_2_2.GroupIncrementOutputCount(10); +} + +// CHECK: [[NODE22]] = OpFunction %void None +// CHECK: %{{[^ ]*}} = OpAllocateNodePayloadsAMDX %{{[^ ]*}} %{{[^ ]*}} %{{[^ ]*}} [[U0]] +// CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl new file mode 100644 index 0000000000..8732cf3478 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.hlsl @@ -0,0 +1,15 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// NumThreads + +[Shader("node")] +[NodeLaunch("thread")] +[NumThreads(1,1,1)] +[NodeIsProgramEntry] +void node010_thread_numthreads_shader() +{ +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]] +// CHECK: OpExecutionMode [[SHADER]] LocalSize 1 1 1 +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl new file mode 100644 index 0000000000..0b230479c4 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/node.thread.num-threads.none.hlsl @@ -0,0 +1,15 @@ +// RUN: %dxc -spirv -Od -T lib_6_8 -fspv-target-env=vulkan1.3 %s | FileCheck %s + +// Thread launch node without NumThreads specified should use a +// default of (1,1,1) + +[Shader("node")] +[NodeLaunch("thread")] +[NodeIsProgramEntry] +void node011_thread_numthreads_none() +{ +} + +// CHECK: OpEntryPoint GLCompute [[SHADER:%[0-9A-Za-z_]*]] +// CHECK: OpExecutionMode [[SHADER]] LocalSize 1 1 1 +// CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl b/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl new file mode 100644 index 0000000000..5d77d222f9 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/op.vector.swizzle.buffer-store.hlsl @@ -0,0 +1,26 @@ +// RUN: %dxc -T cs_6_0 -E main -fcgl %s -spirv | FileCheck %s + +RWStructuredBuffer buffer; + +// CHECK-DAG: [[v4_0:%[0-9]+]] = OpConstantComposite %v4uint %uint_0 %uint_0 %uint_0 %uint_0 +// CHECK-DAG: [[v4_1:%[0-9]+]] = OpConstantComposite %v4uint %uint_1 %uint_1 %uint_1 %uint_1 + +[numthreads(1, 1, 1)] +void main() +{ +// CHECK: [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v4uint %buffer %int_0 %uint_0 +// CHECK: [[load:%[0-9]+]] = OpLoad %v4uint [[ptr]] +// CHECK: [[cast:%[0-9]+]] = OpINotEqual %v4bool [[load]] [[v4_0]] +// CHECK: [[shuf:%[0-9]+]] = OpVectorShuffle %v3bool [[cast]] [[cast]] 0 1 2 +// CHECK: OpStore %a [[shuf]] + bool3 a = buffer[0].xyz; + +// CHECK: [[a:%[0-9]+]] = OpLoad %v3bool %a +// CHECK: [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_v4uint %buffer %int_0 %uint_1 +// CHECK: [[load:%[0-9]+]] = OpLoad %v4uint [[ptr]] +// CHECK: [[cast:%[0-9]+]] = OpINotEqual %v4bool [[load]] [[v4_0]] +// CHECK: [[shuf:%[0-9]+]] = OpVectorShuffle %v4bool [[cast]] [[a]] 4 5 6 3 +// CHECK: [[cast:%[0-9]+]] = OpSelect %v4uint [[shuf]] [[v4_1]] [[v4_0]] +// CHECK: OpStore [[ptr]] [[cast]] + buffer[1].xyz = a; +} diff --git a/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl b/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl index 9576837884..a3701a4ed4 100644 --- a/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl +++ 
b/tools/clang/test/CodeGenSPIRV/rich.debug.function.param.hlsl @@ -9,7 +9,7 @@ // CHECK: [[x:%[0-9]+]] = OpString "x" // CHECK: [[srcMainName:%[0-9]+]] = OpString "main" // CHECK: [[color:%[0-9]+]] = OpString "color" -// CHECK: [[mainName:%[0-9]+]] = OpString "wrapper" +// CHECK: [[mainName:%[0-9]+]] = OpString "__dxc_setup" // CHECK: [[int:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 Signed // CHECK: [[float:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 Float diff --git a/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl b/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl index b263fd88ad..23bb479a46 100644 --- a/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl +++ b/tools/clang/test/CodeGenSPIRV/shader.debug.function.hlsl @@ -6,7 +6,7 @@ // CHECK: [[fooName:%[0-9]+]] = OpString "foo" // CHECK: [[emptyStr:%[0-9]+]] = OpString "" // CHECK: [[srcMainName:%[0-9]+]] = OpString "main" -// CHECK: [[mainName:%[0-9]+]] = OpString "wrapper" +// CHECK: [[mainName:%[0-9]+]] = OpString "__dxc_setup" // CHECK: [[clOpts:%[0-9]+]] = OpString " -E main -T ps_6_0 -spirv -fcgl -fspv-debug=vulkan // CHECK: [[int:%[0-9]+]] = OpExtInst %void [[set]] DebugTypeBasic {{%[0-9]+}} %uint_32 %uint_4 %uint_0 diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl b/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl new file mode 100644 index 0000000000..a592863f1b --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/spv.intrinsicConstantValue.hlsl @@ -0,0 +1,13 @@ +// RUN: %dxc -Od -T cs_6_8 -spirv -fcgl %s | FileCheck %s + +// CHECK: %spirvIntrinsicType = OpTypeInt 8 0 +using uint8_t [[vk::ext_capability(/* Int8 */ 39)]] = + vk::SpirvType >, + vk::Literal > >; + +[[vk::ext_instruction(/* OpConstant */ 43)]] uint8_t mkconsant([[vk::ext_literal]] int v); + +// CHECK: OpConstant %spirvIntrinsicType 42 +static const uint8_t K = mkconsant(42); + +[numthreads(1, 1, 1)] void main() {} diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl b/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl new file mode 100644 index 0000000000..0ecda64dbb --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/spv.intrinsicInTemplate.hlsl @@ -0,0 +1,29 @@ +// RUN: %dxc -T cs_6_8 -HV 2021 -O0 -spirv -fspv-target-env=universal1.5 %s | FileCheck %s + +// CHECK: [[Int8Type:%.*]] = OpTypeInt 8 0 +using Int8Type = vk::SpirvType >, + vk::Literal > >; + +// CHECK: [[MatrixType:%.*]] = OpTypeCooperativeMatrixKHR [[Int8Type]] %uint_3 %uint_16 %uint_16 %uint_0 +using I8MatA = vk::SpirvOpaqueType< + /* OpTypeCooperativeMatrixKHR */ 4456, Int8Type, + vk::integral_constant, + vk::integral_constant, vk::integral_constant, + vk::integral_constant >; + +template +[[vk::ext_instruction(/* OpCooperativeMatrixLoadKHR */ 4457)]] ResultType +__builtin_spv_CooperativeMatrixLoadKHR([[vk::ext_reference]] PointerType pointer, + uint32_t memory_layout, uint32_t stride, [[vk::ext_literal]] uint32_t memory_operand); + +StructuredBuffer buffer : register(t0, space0); + +[numthreads(32, 1, 1)] void main() { + [[vk::ext_extension("SPV_KHR_cooperative_matrix")]] + [[vk::ext_capability(/* CooperativeMatrixKHRCapability */ 6022)]] + [[vk::ext_capability(/* VulkanMemoryModel */ 5345)]] + [[vk::ext_capability(/* Int8 */ 39)]] + // CHECK: OpCooperativeMatrixLoadKHR [[MatrixType]] %{{.*}} %uint_0 %uint_32 None + I8MatA matA = __builtin_spv_CooperativeMatrixLoadKHR(buffer[0], /* rowMajor */ 0, 32, 0); +} diff --git 
a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl index 35d1b868a8..769fe808b2 100644 --- a/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl +++ b/tools/clang/test/CodeGenSPIRV/type.buffer.hlsl @@ -1,109 +1,144 @@ -// RUN: %dxc -T ps_6_0 -E main -fcgl %s -spirv | FileCheck %s +// RUN: %dxc -T ps_6_0 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER +// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_0 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN // CHECK: OpCapability SampledBuffer -// CHECK: OpCapability StorageImageExtendedFormats +// INFER: OpCapability StorageImageExtendedFormats -// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i +// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 R32i +// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 1 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image Buffer<int> intbuf; -// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui +// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 R32ui +// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 1 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0 Buffer<uint> uintbuf; -// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f +// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 R32f +// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 1 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1 Buffer<float> floatbuf; -// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i +// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 R32i +// UNKNOWN: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2 RWBuffer<int> intrwbuf; -// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui +// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 R32ui +// UNKNOWN: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3 RWBuffer<uint> uintrwbuf; -// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f +// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 R32f +// UNKNOWN: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4 RWBuffer<float> floatrwbuf; -// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5 +// If the `Unknown` image format is used, then the images below will reuse the types above.
+// UNKNOWN-NOT: OpTypeImage + +// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 1 Rg32i +// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5 Buffer int2buf; -// CHECK: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6 +// INFER: %type_buffer_image_6 = OpTypeImage %uint Buffer 2 0 0 1 Rg32ui +// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6 Buffer uint2buf; -// CHECK: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7 +// INFER: %type_buffer_image_7 = OpTypeImage %float Buffer 2 0 0 1 Rg32f +// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7 Buffer float2buf; -// CHECK: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8 +// INFER: %type_buffer_image_8 = OpTypeImage %int Buffer 2 0 0 2 Rg32i +// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8 RWBuffer int2rwbuf; -// CHECK: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9 +// INFER: %type_buffer_image_9 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui +// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9 RWBuffer uint2rwbuf; -// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10 +// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rg32f +// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10 RWBuffer float2rwbuf; -// CHECK: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11 -// CHECK: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12 +// INFER: %type_buffer_image_11 = OpTypeImage %int Buffer 2 0 0 1 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_11 = OpTypePointer UniformConstant %type_buffer_image_11 +// INFER: %type_buffer_image_12 = OpTypeImage %int Buffer 2 0 0 1 Rgba32i +// INFER: %_ptr_UniformConstant_type_buffer_image_12 = OpTypePointer UniformConstant %type_buffer_image_12 Buffer int3buf; Buffer int4buf; -// CHECK: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13 -// CHECK: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14 +// INFER: %type_buffer_image_13 = OpTypeImage %uint Buffer 2 0 0 1 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_13 = OpTypePointer UniformConstant %type_buffer_image_13 +// INFER: %type_buffer_image_14 = OpTypeImage %uint Buffer 2 0 0 1 Rgba32ui +// INFER: 
%_ptr_UniformConstant_type_buffer_image_14 = OpTypePointer UniformConstant %type_buffer_image_14 Buffer uint3buf; Buffer uint4buf; -// CHECK: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15 -// CHECK: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16 +// INFER: %type_buffer_image_15 = OpTypeImage %float Buffer 2 0 0 1 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_15 = OpTypePointer UniformConstant %type_buffer_image_15 +// INFER: %type_buffer_image_16 = OpTypeImage %float Buffer 2 0 0 1 Rgba32f +// INFER: %_ptr_UniformConstant_type_buffer_image_16 = OpTypePointer UniformConstant %type_buffer_image_16 Buffer float3buf; Buffer float4buf; -// CHECK: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17 -// CHECK: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18 +// INFER: %type_buffer_image_17 = OpTypeImage %int Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_17 = OpTypePointer UniformConstant %type_buffer_image_17 +// INFER: %type_buffer_image_18 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i +// INFER: %_ptr_UniformConstant_type_buffer_image_18 = OpTypePointer UniformConstant %type_buffer_image_18 RWBuffer int3rwbuf; RWBuffer int4rwbuf; -// CHECK: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19 -// CHECK: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20 +// INFER: %type_buffer_image_19 = OpTypeImage %uint Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_19 = OpTypePointer UniformConstant %type_buffer_image_19 +// INFER: %type_buffer_image_20 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui +// INFER: %_ptr_UniformConstant_type_buffer_image_20 = OpTypePointer UniformConstant %type_buffer_image_20 RWBuffer uint3rwbuf; RWBuffer uint4rwbuf; -// CHECK: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21 -// CHECK: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22 +// INFER: %type_buffer_image_21 = OpTypeImage %float Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_21 = OpTypePointer UniformConstant %type_buffer_image_21 +// INFER: %type_buffer_image_22 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_buffer_image_22 = OpTypePointer UniformConstant %type_buffer_image_22 RWBuffer float3rwbuf; RWBuffer float4rwbuf; -// CHECK: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant -// CHECK: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant -// CHECK: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant -// CHECK: %intrwbuf = OpVariable 
%_ptr_UniformConstant_type_buffer_image_2 UniformConstant -// CHECK: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant -// CHECK: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant -// CHECK: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant -// CHECK: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant -// CHECK: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant -// CHECK: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant -// CHECK: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant -// CHECK: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant -// CHECK: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant -// CHECK: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant -// CHECK: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant -// CHECK: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant -// CHECK: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant -// CHECK: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_16 UniformConstant -// CHECK: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant -// CHECK: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant -// CHECK: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant -// CHECK: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant -// CHECK: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant -// CHECK: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant +// INFER: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// INFER: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// INFER: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// INFER: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// INFER: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// INFER: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant +// INFER: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant +// INFER: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant +// INFER: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant +// INFER: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant +// INFER: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant +// INFER: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant +// INFER: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_11 UniformConstant +// INFER: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_12 UniformConstant +// INFER: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_13 UniformConstant +// INFER: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_14 UniformConstant +// INFER: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_15 UniformConstant +// INFER: %float4buf = OpVariable 
%_ptr_UniformConstant_type_buffer_image_16 UniformConstant +// INFER: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_17 UniformConstant +// INFER: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_18 UniformConstant +// INFER: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_19 UniformConstant +// INFER: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_20 UniformConstant +// INFER: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_21 UniformConstant +// INFER: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_22 UniformConstant + +// UNKNOWN: %intbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uintbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %floatbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %intrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// UNKNOWN: %uintrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// UNKNOWN: %floatrwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant +// UNKNOWN: %int2buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uint2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %float2buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %int2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// UNKNOWN: %uint2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// UNKNOWN: %float2rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant +// UNKNOWN: %int3buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %int4buf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uint3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %uint4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %float3buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %float4buf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %int3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// UNKNOWN: %int4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// UNKNOWN: %uint3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// UNKNOWN: %uint4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// UNKNOWN: %float3rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant +// UNKNOWN: %float4rwbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant void main() {} diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl index c616f65bb9..cf84562e52 100644 --- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl +++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-buffer.hlsl @@ -1,59 +1,80 @@ -// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s +// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER +// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN + +// Before 
vulkan1.3, we should be trying to infer the image type, because +// we cannot necessarily use Unknown. However, in VK1.3 and later, we can use +// Unknown. // CHECK: OpCapability SampledBuffer -// CHECK: OpCapability StorageImageExtendedFormats +// INFER: OpCapability StorageImageExtendedFormats -// CHECK: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i +// INFER: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 R32i +// UNKNOWN: %type_buffer_image = OpTypeImage %int Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image = OpTypePointer UniformConstant %type_buffer_image RasterizerOrderedBuffer<int> introvbuf; -// CHECK: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui +// INFER: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 R32ui +// UNKNOWN: %type_buffer_image_0 = OpTypeImage %uint Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_0 = OpTypePointer UniformConstant %type_buffer_image_0 RasterizerOrderedBuffer<uint> uintrovbuf; -// CHECK: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f +// INFER: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 R32f +// UNKNOWN: %type_buffer_image_1 = OpTypeImage %float Buffer 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_buffer_image_1 = OpTypePointer UniformConstant %type_buffer_image_1 RasterizerOrderedBuffer<float> floatrovbuf; -// CHECK: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2 +// INFER: %type_buffer_image_2 = OpTypeImage %int Buffer 2 0 0 2 Rg32i +// INFER: %_ptr_UniformConstant_type_buffer_image_2 = OpTypePointer UniformConstant %type_buffer_image_2 RasterizerOrderedBuffer<int2> int2rovbuf; -// CHECK: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3 +// INFER: %type_buffer_image_3 = OpTypeImage %uint Buffer 2 0 0 2 Rg32ui +// INFER: %_ptr_UniformConstant_type_buffer_image_3 = OpTypePointer UniformConstant %type_buffer_image_3 RasterizerOrderedBuffer<uint2> uint2rovbuf; -// CHECK: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4 +// INFER: %type_buffer_image_4 = OpTypeImage %float Buffer 2 0 0 2 Rg32f +// INFER: %_ptr_UniformConstant_type_buffer_image_4 = OpTypePointer UniformConstant %type_buffer_image_4 RasterizerOrderedBuffer<float2> float2rovbuf; -// CHECK: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5 -// CHECK: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i -// CHECK: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6 +// INFER: %type_buffer_image_5 = OpTypeImage %int Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_5 = OpTypePointer UniformConstant %type_buffer_image_5 +// INFER: %type_buffer_image_6 = OpTypeImage %int Buffer 2 0 0 2 Rgba32i +// INFER: %_ptr_UniformConstant_type_buffer_image_6 = OpTypePointer UniformConstant %type_buffer_image_6 RasterizerOrderedBuffer<int3> int3rovbuf; RasterizerOrderedBuffer<int4> int4rovbuf; -// CHECK: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7 -// CHECK: 
%type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui -// CHECK: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8 +// INFER: %type_buffer_image_7 = OpTypeImage %uint Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_7 = OpTypePointer UniformConstant %type_buffer_image_7 +// INFER: %type_buffer_image_8 = OpTypeImage %uint Buffer 2 0 0 2 Rgba32ui +// INFER: %_ptr_UniformConstant_type_buffer_image_8 = OpTypePointer UniformConstant %type_buffer_image_8 RasterizerOrderedBuffer uint3rovbuf; RasterizerOrderedBuffer uint4rovbuf; -// CHECK: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown -// CHECK: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9 -// CHECK: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10 +// INFER: %type_buffer_image_9 = OpTypeImage %float Buffer 2 0 0 2 Unknown +// INFER: %_ptr_UniformConstant_type_buffer_image_9 = OpTypePointer UniformConstant %type_buffer_image_9 +// INFER: %type_buffer_image_10 = OpTypeImage %float Buffer 2 0 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_buffer_image_10 = OpTypePointer UniformConstant %type_buffer_image_10 RasterizerOrderedBuffer float3rovbuf; RasterizerOrderedBuffer float4rovbuf; -// CHECK: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant -// CHECK: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant -// CHECK: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant -// CHECK: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant -// CHECK: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant -// CHECK: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant -// CHECK: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant -// CHECK: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant -// CHECK: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant -// CHECK: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant -// CHECK: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_9 UniformConstant -// CHECK: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant +// INFER: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// INFER: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// INFER: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// INFER: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_2 UniformConstant +// INFER: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_3 UniformConstant +// INFER: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_4 UniformConstant +// INFER: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_5 UniformConstant +// INFER: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_6 UniformConstant +// INFER: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_7 UniformConstant +// INFER: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_8 UniformConstant +// INFER: %float3rovbuf = OpVariable 
%_ptr_UniformConstant_type_buffer_image_9 UniformConstant +// INFER: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_10 UniformConstant + +// UNKNOWN: %introvbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uintrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %floatrovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %int2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uint2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %float2rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %int3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %int4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image UniformConstant +// UNKNOWN: %uint3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %uint4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_0 UniformConstant +// UNKNOWN: %float3rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant +// UNKNOWN: %float4rovbuf = OpVariable %_ptr_UniformConstant_type_buffer_image_1 UniformConstant void main() {} diff --git a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl index 32dd76e6f1..651840b0e6 100644 --- a/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl +++ b/tools/clang/test/CodeGenSPIRV/type.rasterizer-ordered-texture.hlsl @@ -1,23 +1,27 @@ -// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s +// RUN: %dxc -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER +// RUN: %dxc -fspv-use-unknown-image-format -T ps_6_6 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN // CHECK: OpCapability Image1D -// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i +// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i +// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image -// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui +// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui +// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image -// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i +// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i +// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown // CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image -// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0 -// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i -// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array -// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui -// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array -// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0 -// CHECK: 
%type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0 +// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0 +// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i +// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array +// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui +// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array +// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0 +// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant @@ -33,7 +37,8 @@ RasterizerOrderedTexture3D t3 ; [[vk::image_format("rgba32f")]] RasterizerOrderedTexture3D t4 ; -// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant +// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant +// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant RasterizerOrderedTexture3D t5 ; // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant diff --git a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl index f901d44cfa..44e7592869 100644 --- a/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl +++ b/tools/clang/test/CodeGenSPIRV/type.rwtexture.hlsl @@ -1,24 +1,43 @@ -// RUN: %dxc -T vs_6_0 -E main -fcgl %s -spirv | FileCheck %s +// RUN: %dxc -T vs_6_0 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,INFER +// RUN: %dxc -fspv-use-unknown-image-format -T vs_6_0 -E main -fcgl %s -spirv | FileCheck %s --check-prefixes=CHECK,UNKNOWN // CHECK: OpCapability Image1D -// CHECK: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i -// CHECK: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image -// CHECK: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui -// CHECK: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image -// CHECK: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i -// CHECK: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image -// CHECK: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0 -// CHECK: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i -// CHECK: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array -// CHECK: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui -// CHECK: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array -// CHECK: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0 -// CHECK: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f -// CHECK: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant 
%type_2d_image_array_0 +// INFER: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 R32i +// INFER: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image +// INFER: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Rg32ui +// INFER: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image +// INFER: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 R32i +// INFER: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image +// INFER: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0 +// INFER: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 R32i +// INFER: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array +// INFER: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Rg32ui +// INFER: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array +// INFER: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0 +// INFER: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Rgba32f +// INFER: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0 +// UNKNOWN: %type_1d_image = OpTypeImage %int 1D 2 0 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_1d_image = OpTypePointer UniformConstant %type_1d_image +// UNKNOWN: %type_2d_image = OpTypeImage %uint 2D 2 0 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image +// UNKNOWN: %type_3d_image = OpTypeImage %int 3D 2 0 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_3d_image = OpTypePointer UniformConstant %type_3d_image +// UNKNOWN: %type_3d_image_0 = OpTypeImage %float 3D 2 0 0 2 Rgba32f +// UNKNOWN: %_ptr_UniformConstant_type_3d_image_0 = OpTypePointer UniformConstant %type_3d_image_0 +// UNKNOWN: %type_3d_image_1 = OpTypeImage %float 3D 2 0 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_3d_image_1 = OpTypePointer UniformConstant %type_3d_image_1 +// UNKNOWN: %type_1d_image_array = OpTypeImage %int 1D 2 1 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array = OpTypePointer UniformConstant %type_1d_image_array +// UNKNOWN: %type_2d_image_array = OpTypeImage %uint 2D 2 1 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array = OpTypePointer UniformConstant %type_2d_image_array +// UNKNOWN: %type_1d_image_array_0 = OpTypeImage %float 1D 2 1 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_1d_image_array_0 = OpTypePointer UniformConstant %type_1d_image_array_0 +// UNKNOWN: %type_2d_image_array_0 = OpTypeImage %float 2D 2 1 0 2 Unknown +// UNKNOWN: %_ptr_UniformConstant_type_2d_image_array_0 = OpTypePointer UniformConstant %type_2d_image_array_0 // CHECK: %t1 = OpVariable %_ptr_UniformConstant_type_1d_image UniformConstant RWTexture1D t1 ; @@ -33,7 +52,8 @@ RWTexture3D t3 ; [[vk::image_format("rgba32f")]] RWTexture3D t4 ; -// CHECK: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant +// INFER: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_0 UniformConstant +// UNKNOWN: %t5 = OpVariable %_ptr_UniformConstant_type_3d_image_1 UniformConstant RWTexture3D t5 ; // CHECK: %t6 = OpVariable %_ptr_UniformConstant_type_1d_image_array UniformConstant diff --git a/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl 
b/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl new file mode 100644 index 0000000000..9719dc1dc5 --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/vertex_shader_derivative_in_branch.hlsl @@ -0,0 +1,23 @@ +// RUN: %dxc -T vs_6_0 -E main -DCOND=false -fspv-target-env=vulkan1.3 %s -spirv | FileCheck %s +// CHECK-NOT: OpCapability DerivativeControl +// CHECK-NOT: OpExtension "SPV_KHR_compute_shader_derivatives" + +// RUN: not %dxc -T vs_6_0 -E main -DCOND=true -fspv-target-env=vulkan1.3 %s -spirv 2>&1 | FileCheck %s -check-prefix=ERROR +// ERROR: generated SPIR-V is invalid: +// ERROR-NEXT: Derivative instructions require Fragment, GLCompute, MeshEXT or TaskEXT execution model: DPdx + +struct VSOut +{ + float4 pos : SV_Position; +}; + +VSOut main(float4 pos : POSITION) +{ + VSOut output; + output.pos = pos; + if (COND) + { + output.pos += ddx(pos); + } + return output; +} diff --git a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl index 12b03fffda..4d10dc446b 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl @@ -60,12 +60,6 @@ RWBuffer Buf_r64i; [[vk::image_format("r64ui")]] RWBuffer Buf_r64ui; -[[vk::image_format("r16f")]] -// CHECK: [[ImgType:%[0-9a-zA-Z_]+]] = OpTypeImage %float 2D 2 0 0 2 R16f -// CHECK: [[ArrayType:%[0-9a-zA-Z_]+]] = OpTypeRuntimeArray [[ImgType]] -// CHECK: [[PtrType:%[0-9a-zA-Z_]+]] = OpTypePointer UniformConstant [[ArrayType]] -RWTexture2D Buf_r16f_bindless[]; - struct S { RWBuffer b; }; diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl index f0f5c54a16..e063a4bc23 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.cs.hlsl @@ -20,7 +20,7 @@ void main() { foo(rwbuf[0].Get()); } -// CHECK: [[L0:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} %{{[_0-9A-Za-z]*}} Aligned 8 +// CHECK: [[L0:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} %{{[_0-9A-Za-z]*}} // CHECK: [[L1:%[_0-9A-Za-z]*]] = OpLoad %{{[_0-9A-Za-z]*}} [[L0]] Aligned 8 // CHECK: [[L2:%[_0-9A-Za-z]*]] = OpAccessChain %{{[_0-9A-Za-z]*}} [[L1]] %int_0 // CHECK: OpStore [[L2]] %int_1 Aligned 4 diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl index fc5b9edad0..e159f6997c 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.alias.hlsl @@ -62,10 +62,10 @@ float4 MainPs(void) : SV_Target0 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[X3]] // CHECK: OpStore [[BP1]] [[X4]] // CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[VTEST]] -// CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP0]] Aligned 16 +// CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP0]] // CHECK: [[X7:%[_0-9A-Za-z]*]] = OpAccessChain [[PBV4FLOAT]] [[X6]] [[I1]] // CHECK: OpStore [[X7]] [[X5]] Aligned 16 -// CHECK: [[X8:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP1]] Aligned 16 +// CHECK: [[X8:%[_0-9A-Za-z]*]] = OpLoad [[PGS]] [[BP1]] // CHECK: [[X9:%[_0-9A-Za-z]*]] = OpAccessChain [[PBV4FLOAT]] [[X8]] [[I1]] // CHECK: [[X10:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X9]] Aligned 16 // CHECK: OpReturnValue [[X10]] diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl 
index 992d8b39fd..485da6fd93 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.atomic.hlsl @@ -29,7 +29,7 @@ void main() uint u0, u1; // CHECK: [[X1:%[_0-9]+]] = OpAccessChain %{{[_0-9A-Za-z]*}} [[PC]] [[I0]] -// CHECK: [[X2:%[_0-9]+]] = OpLoad [[PS]] [[X1]] Aligned 4 +// CHECK: [[X2:%[_0-9]+]] = OpLoad [[PS]] [[X1]] // CHECK: [[X3:%[_0-9]+]] = OpAccessChain [[PU]] [[X2]] [[I0]] // CHECK: [[X4:%[_0-9]+]] = OpLoad [[UINT]] [[IN]] // CHECK: [[X5:%[_0-9]+]] = OpAtomicExchange [[UINT]] [[X3]] [[U1]] [[U0]] [[X4]] diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl index b44e1eca09..e7908e0ce7 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.from-uint.hlsl @@ -37,8 +37,8 @@ void main() { // CHECK: [[TEST:%[_0-9A-Za-z]*]] = OpVariable [[PFPPUINT]] Function // CHECK: [[X1:%[_0-9A-Za-z]*]] = OpConvertUToPtr [[PPUINT]] // CHECK: OpStore [[TEST]] [[X1]] -// CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PPUINT]] [[TEST]] Aligned 32 -// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[UINT]] [[X2]] Aligned 4 +// CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PPUINT]] [[TEST]] +// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[UINT]] [[X2]] Aligned 32 // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpAccessChain [[PUUINT]] [[OUTPUT]] [[I0]] [[U0]] // CHECK: OpStore [[X4]] [[X3]] // CHECK: OpReturn diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl index 71fee1a795..75380d3f4e 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.linked-list.hlsl @@ -76,9 +76,9 @@ float4 MainPs(void) : SV_Target0 // CHECK: [[X1:%[_0-9A-Za-z]*]] = OpAccessChain [[PPBLOCK1]] [[GPC]] [[S0]] // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X1]] // CHECK: OpStore [[GP]] [[X2]] -// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] Aligned 32 +// CHECK: [[X3:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpAccessChain [[PPBLOCK2]] [[X3]] [[S1]] -// CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X4]] Aligned 8 +// CHECK: [[X5:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[X4]] Aligned 32 // CHECK: OpStore [[GP]] [[X5]] // CHECK: [[X6:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] // CHECK: [[X7:%[_0-9A-Za-z]*]] = OpConvertPtrToU [[ULONG]] [[X6]] @@ -94,7 +94,7 @@ float4 MainPs(void) : SV_Target0 // CHECK: [[IF_TRUE]] = OpLabel // CHECK: OpReturnValue [[CV4FLOAT]] // CHECK: [[IF_MERGE]] = OpLabel -// CHECK: [[X13:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] Aligned 32 +// CHECK: [[X13:%[_0-9A-Za-z]*]] = OpLoad [[PBLOCK]] [[GP]] // CHECK: [[X14:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X13]] [[S0]] // CHECK: [[X15:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X14]] Aligned 16 // CHECK: OpReturnValue [[X15]] diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl index c7d6f0ed2b..cc3b1a0209 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.read.hlsl @@ -36,7 +36,8 @@ struct TestPushConstant_t float4 MainPs(void) : SV_Target0 { float4 vTest = g_PushConstants.m_nBufferDeviceAddress.Get().g_vTestFloat4; - return vTest; + float f = vk::BufferPointer(0xdeadbeefull).Get(); + return 
vTest+f; } // CHECK: [[FUN]] = OpFunction @@ -44,5 +45,9 @@ float4 MainPs(void) : SV_Target0 // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PGLOBALS]] [[X1]] // CHECK: [[X3:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X2]] [[S1]] // CHECK: [[X4:%[_0-9A-Za-z]*]] = OpLoad [[V4FLOAT]] [[X3]] Aligned 16 -// CHECK: OpStore [[OUT]] [[X4]] +// CHECK: [[TEMP_PTR:%[_0-9A-Za-z]*]] = OpConvertUToPtr %_ptr_PhysicalStorageBuffer_float %ulong_3735928559 +// CHECK: [[LD:%[_0-9A-Za-z]*]] = OpLoad %float [[TEMP_PTR]] Aligned 4 +// CHECK: [[CONSTRUCT:%[_0-9A-Za-z]*]] = OpCompositeConstruct [[V4FLOAT]] [[LD]] [[LD]] [[LD]] [[LD]] +// CHECK: [[ADD:%[_0-9A-Za-z]*]] = OpFAdd [[V4FLOAT]] [[X4]] [[CONSTRUCT]] +// CHECK: OpStore [[OUT]] [[ADD]] // CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl index 930770cc16..5132c57000 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.rvalue.hlsl @@ -1,4 +1,5 @@ -// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 %s | FileCheck %s +// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 %s | FileCheck %s --check-prefix=CHECK --check-prefix=NOFUN +// RUN: %dxc -spirv -HV 202x -Od -T cs_6_9 -DFUN %s | FileCheck %s --check-prefix=CHECK --check-prefix=FUN // Issue #7302: implicit object argument of Get() evaluates to rvalue @@ -20,16 +21,45 @@ struct Content // CHECK: [[V2UINT:%[_0-9A-Za-z]*]] = OpTypeVector [[UINT]] 2 // CHECK: [[VECTOR:%[_0-9A-Za-z]*]] = OpConstantComposite [[V2UINT]] [[UDEADBEEF]] [[U0]] // CHECK: [[CONTENT:%[_0-9A-Za-z]*]] = OpTypeStruct [[INT]] -// CHECK: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT]] -// CHECK: [[PPINT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[INT]] +// FUN: [[PFCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer Function [[CONTENT]] +// FUN: [[PFINT:%[_0-9A-Za-z]*]] = OpTypePointer Function [[INT]] +// FUN: [[CONTENT0:%[_0-9A-Za-z]*]] = OpTypeStruct [[INT]] +// FUN: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT0]] +// NOFUN: [[PPCONTENT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[CONTENT]] +// NOFUN: [[PPINT:%[_0-9A-Za-z]*]] = OpTypePointer PhysicalStorageBuffer [[INT]] + +Content f() { + return bitcast >(uint32_t2(0xdeadbeefu,0x0u)).Get(); +} [numthreads(1, 1, 1)] void main() { +#ifdef FUN + Content c = f(); + c.a = 1; +#else bitcast >(uint32_t2(0xdeadbeefu,0x0u)).Get().a = 1; +#endif } -// CHECK: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]] -// CHECK: [[PTR:%[0-9]*]] = OpAccessChain [[PPINT]] [[BITCAST]] [[IO]] -// CHECK: OpStore [[PTR]] [[I1]] Aligned 4 +// NOFUN: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]] +// NOFUN: [[PTR:%[0-9]*]] = OpAccessChain [[PPINT]] [[BITCAST]] [[IO]] +// NOFUN: OpStore [[PTR]] [[I1]] Aligned 4 + +// FUN: [[VAR:%[_0-9A-Za-z]*]] = OpVariable [[PFCONTENT]] Function +// FUN: [[CALL:%[0-9]*]] = OpFunctionCall [[CONTENT]] [[F:%[_0-9A-Za-z]*]] +// FUN: OpStore [[VAR]] [[CALL]] +// FUN: [[PTR:%[0-9]*]] = OpAccessChain [[PFINT]] [[VAR]] [[IO]] +// FUN: OpStore [[PTR]] [[I1]] + +// FUN: [[F]] = OpFunction [[CONTENT]] +// FUN: [[VAR:%[_0-9A-Za-z]*]] = OpVariable [[PFCONTENT]] Function +// FUN: [[BITCAST:%[0-9]*]] = OpBitcast [[PPCONTENT]] [[VECTOR]] +// FUN: [[CVAL0:%[0-9]*]] = OpLoad [[CONTENT0]] [[BITCAST]] Aligned 4 +// FUN: [[IVAL:%[0-9]*]] = OpCompositeExtract [[INT]] [[CVAL0]] 0 +// FUN: [[CVAL1:%[0-9]*]] = OpCompositeConstruct [[CONTENT]] [[IVAL]] 
+// FUN: OpStore [[VAR]] [[CVAL1]] +// FUN: [[RET:%[0-9]*]] = OpLoad [[CONTENT]] [[VAR]] +// FUN: OpReturnValue [[RET]] diff --git a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl index b2efd02cbd..843815a4a0 100644 --- a/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl +++ b/tools/clang/test/CodeGenSPIRV/vk.buffer-pointer.write.hlsl @@ -40,6 +40,7 @@ float4 MainPs(void) : SV_Target0 { float4 vTest = float4(1.0,0.0,0.0,0.0); g_PushConstants.m_nBufferDeviceAddress.Get().g_vTestFloat4 = vTest; + vk::BufferPointer(0xdeadbeefull).Get() = 4.5f; return vTest; } @@ -48,5 +49,7 @@ float4 MainPs(void) : SV_Target0 // CHECK: [[X2:%[_0-9A-Za-z]*]] = OpLoad [[PGLOBALS]] [[X1]] // CHECK: [[X3:%[_0-9A-Za-z]*]] = OpAccessChain [[PV4FLOAT2]] [[X2]] [[S1]] // CHECK: OpStore [[X3]] [[CV4FLOAT]] Aligned 16 +// CHECK: [[TEMP_PTR:%[_0-9A-Za-z]*]] = OpConvertUToPtr %_ptr_PhysicalStorageBuffer_float %ulong_3735928559 +// CHECK: OpStore [[TEMP_PTR]] %float_4_5 Aligned 4 // CHECK: OpStore [[OUT]] [[CV4FLOAT]] // CHECK: OpFunctionEnd diff --git a/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl b/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl new file mode 100644 index 0000000000..6dac20fc6f --- /dev/null +++ b/tools/clang/test/CodeGenSPIRV/vk.cloption.invert-y.lib.hlsl @@ -0,0 +1,12 @@ +// RUN: %dxc -T lib_6_3 -fvk-invert-y -fcgl %s -spirv | FileCheck %s + +[shader("vertex")] +float4 main(float4 a : A) : SV_Position { + return a; +} + +// CHECK: [[a:%[0-9]+]] = OpFunctionCall %v4float %src_main %param_var_a +// CHECK-NEXT: [[oldY:%[0-9]+]] = OpCompositeExtract %float [[a]] 1 +// CHECK-NEXT: [[newY:%[0-9]+]] = OpFNegate %float [[oldY]] +// CHECK-NEXT: [[pos:%[0-9]+]] = OpCompositeInsert %v4float [[newY]] [[a]] 1 +// CHECK-NEXT: OpStore %gl_Position [[pos]] diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll new file mode 100644 index 0000000000..4fc6a47780 --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_accessors_dxilgen.ll @@ -0,0 +1,687 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +; +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; outbuf UAV byte r/w U0u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RWByteAddressBuffer = type { i32 } +%dx.types.HitObject = type { i8* } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.dx::HitObject" = type { i32 } + +@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 + +; CHECK: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %{{[^ ]+}}, i32 1) +; CHECK: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 
@dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject %{{[^ ]+}}) +; CHECK: %{{[^ ]+}} = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %{{[^ ]+}}, i32 42) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 0) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 1) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %{{[^ ]+}}, i32 2) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 0) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 1) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %{{[^ ]+}}, i32 2) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 0) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 1) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %{{[^ ]+}}, i32 2) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 0) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 1) +; CHECK: %{{[^ ]+}} = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %{{[^ ]+}}, i32 2) + +; CHECK: %[[M34OW00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO:[^ ]+]], i32 0, i32 0) +; CHECK-NEXT: %[[M34VOW0:[^ ]+]] = insertelement <12 x float> undef, float %[[M34OW00]], i64 0 +; CHECK-NEXT: %[[M34OW01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 1) +; CHECK-NEXT: %[[M34VOW1:[^ ]+]] = insertelement <12 x float> %[[M34VOW0]], float %[[M34OW01]], i64 1 +; CHECK-NEXT: %[[M34OW02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 2) +; CHECK-NEXT: %[[M34VOW2:[^ ]+]] = insertelement <12 x float> %[[M34VOW1]], float %[[M34OW02]], i64 2 +; CHECK-NEXT: %[[M34OW03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 0, i32 3) +; CHECK-NEXT: %[[M34VOW3:[^ ]+]] = insertelement <12 x float> %[[M34VOW2]], float %[[M34OW03]], i64 3 +; CHECK-NEXT: %[[M34OW10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 0) +; CHECK-NEXT: %[[M34VOW4:[^ ]+]] = insertelement <12 x float> %[[M34VOW3]], float %[[M34OW10]], i64 4 +; CHECK-NEXT: %[[M34OW11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 1) +; CHECK-NEXT: %[[M34VOW5:[^ ]+]] = insertelement <12 x float> %[[M34VOW4]], float %[[M34OW11]], i64 5 +; CHECK-NEXT: %[[M34OW12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject 
%[[M34OWHO]], i32 1, i32 2) +; CHECK-NEXT: %[[M34VOW6:[^ ]+]] = insertelement <12 x float> %[[M34VOW5]], float %[[M34OW12]], i64 6 +; CHECK-NEXT: %[[M34OW13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 1, i32 3) +; CHECK-NEXT: %[[M34VOW7:[^ ]+]] = insertelement <12 x float> %[[M34VOW6]], float %[[M34OW13]], i64 7 +; CHECK-NEXT: %[[M34OW20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 0) +; CHECK-NEXT: %[[M34VOW8:[^ ]+]] = insertelement <12 x float> %[[M34VOW7]], float %[[M34OW20]], i64 8 +; CHECK-NEXT: %[[M34OW21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 1) +; CHECK-NEXT: %[[M34VOW9:[^ ]+]] = insertelement <12 x float> %[[M34VOW8]], float %[[M34OW21]], i64 9 +; CHECK-NEXT: %[[M34OW22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 2) +; CHECK-NEXT: %[[M34VOW10:[^ ]+]] = insertelement <12 x float> %[[M34VOW9]], float %[[M34OW22]], i64 10 +; CHECK-NEXT: %[[M34OW23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M34OWHO]], i32 2, i32 3) +; CHECK-NEXT: %{{[^ ]+}} = insertelement <12 x float> %[[M34VOW10]], float %[[M34OW23]], i64 11 + +; CHECK: %[[M43OW00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO:[^ ]+]], i32 0, i32 0) +; CHECK-NEXT: %[[M43VOW0:[^ ]+]] = insertelement <12 x float> undef, float %[[M43OW00]], i64 0 +; CHECK-NEXT: %[[M43OW10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 0) +; CHECK-NEXT: %[[M43VOW1:[^ ]+]] = insertelement <12 x float> %[[M43VOW0]], float %[[M43OW10]], i64 1 +; CHECK-NEXT: %[[M43OW20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 0) +; CHECK-NEXT: %[[M43VOW2:[^ ]+]] = insertelement <12 x float> %[[M43VOW1]], float %[[M43OW20]], i64 2 +; CHECK-NEXT: %[[M43OW01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 1) +; CHECK-NEXT: %[[M43VOW3:[^ ]+]] = insertelement <12 x float> %[[M43VOW2]], float %[[M43OW01]], i64 3 +; CHECK-NEXT: %[[M43OW11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 1) +; CHECK-NEXT: %[[M43VOW4:[^ ]+]] = insertelement <12 x float> %[[M43VOW3]], float %[[M43OW11]], i64 4 +; CHECK-NEXT: %[[M43OW21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 1) +; CHECK-NEXT: %[[M43VOW5:[^ ]+]] = insertelement <12 x float> %[[M43VOW4]], float %[[M43OW21]], i64 5 +; CHECK-NEXT: %[[M43OW02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 2) +; CHECK-NEXT: %[[M43VOW6:[^ ]+]] = insertelement <12 x float> %[[M43VOW5]], float %[[M43OW02]], i64 6 +; CHECK-NEXT: %[[M43OW12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 2) +; CHECK-NEXT: %[[M43VOW7:[^ ]+]] = insertelement <12 x float> %[[M43VOW6]], float %[[M43OW12]], i64 7 +; CHECK-NEXT: %[[M43OW22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 2) +; CHECK-NEXT: %[[M43VOW8:[^ ]+]] = insertelement <12 x float> %[[M43VOW7]], float %[[M43OW22]], i64 8 +; CHECK-NEXT: %[[M43OW03:[^ ]+]] = call float 
@dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 0, i32 3) +; CHECK-NEXT: %[[M43VOW9:[^ ]+]] = insertelement <12 x float> %[[M43VOW8]], float %[[M43OW03]], i64 9 +; CHECK-NEXT: %[[M43OW13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 1, i32 3) +; CHECK-NEXT: %[[M43VOW10:[^ ]+]] = insertelement <12 x float> %[[M43VOW9]], float %[[M43OW13]], i64 10 +; CHECK-NEXT: %[[M43OW23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %[[M43OWHO]], i32 2, i32 3) +; CHECK-NEXT: %{{[^ ]+}} = insertelement <12 x float> %[[M43VOW10]], float %[[M43OW23]], i64 11 + +; CHECK: %[[M34WO00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO:[^ ]+]], i32 0, i32 0) +; CHECK-NEXT: %[[M34VWO0:[^ ]+]] = insertelement <12 x float> undef, float %[[M34WO00]], i64 0 +; CHECK-NEXT: %[[M34WO01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 1) +; CHECK-NEXT: %[[M34VWO1:[^ ]+]] = insertelement <12 x float> %[[M34VWO0]], float %[[M34WO01]], i64 1 +; CHECK-NEXT: %[[M34WO02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 2) +; CHECK-NEXT: %[[M34VWO2:[^ ]+]] = insertelement <12 x float> %[[M34VWO1]], float %[[M34WO02]], i64 2 +; CHECK-NEXT: %[[M34WO03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 0, i32 3) +; CHECK-NEXT: %[[M34VWO3:[^ ]+]] = insertelement <12 x float> %[[M34VWO2]], float %[[M34WO03]], i64 3 +; CHECK-NEXT: %[[M34WO10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 0) +; CHECK-NEXT: %[[M34VWO4:[^ ]+]] = insertelement <12 x float> %[[M34VWO3]], float %[[M34WO10]], i64 4 +; CHECK-NEXT: %[[M34WO11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 1) +; CHECK-NEXT: %[[M34VWO5:[^ ]+]] = insertelement <12 x float> %[[M34VWO4]], float %[[M34WO11]], i64 5 +; CHECK-NEXT: %[[M34WO12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 2) +; CHECK-NEXT: %[[M34VWO6:[^ ]+]] = insertelement <12 x float> %[[M34VWO5]], float %[[M34WO12]], i64 6 +; CHECK-NEXT: %[[M34WO13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 1, i32 3) +; CHECK-NEXT: %[[M34VWO7:[^ ]+]] = insertelement <12 x float> %[[M34VWO6]], float %[[M34WO13]], i64 7 +; CHECK-NEXT: %[[M34WO20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 0) +; CHECK-NEXT: %[[M34VWO8:[^ ]+]] = insertelement <12 x float> %[[M34VWO7]], float %[[M34WO20]], i64 8 +; CHECK-NEXT: %[[M34WO21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 1) +; CHECK-NEXT: %[[M34VWO9:[^ ]+]] = insertelement <12 x float> %[[M34VWO8]], float %[[M34WO21]], i64 9 +; CHECK-NEXT: %[[M34WO22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 2) +; CHECK-NEXT: %[[M34VWO10:[^ ]+]] = insertelement <12 x float> %[[M34VWO9]], float %[[M34WO22]], i64 10 +; CHECK-NEXT: %[[M34WO23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M34WOHO]], i32 2, i32 3) +; CHECK-NEXT: %{{[^ ]+}} = insertelement <12 x float> %[[M34VWO10]], float %[[M34WO23]], i64 11 + +; CHECK: 
%[[M43WO00:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO:[^ ]+]], i32 0, i32 0) +; CHECK-NEXT: %[[M43VWO0:[^ ]+]] = insertelement <12 x float> undef, float %[[M43WO00]], i64 0 +; CHECK-NEXT: %[[M43WO10:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 0) +; CHECK-NEXT: %[[M43VWO1:[^ ]+]] = insertelement <12 x float> %[[M43VWO0]], float %[[M43WO10]], i64 1 +; CHECK-NEXT: %[[M43WO20:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 0) +; CHECK-NEXT: %[[M43VWO2:[^ ]+]] = insertelement <12 x float> %[[M43VWO1]], float %[[M43WO20]], i64 2 +; CHECK-NEXT: %[[M43WO01:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 1) +; CHECK-NEXT: %[[M43VWO3:[^ ]+]] = insertelement <12 x float> %[[M43VWO2]], float %[[M43WO01]], i64 3 +; CHECK-NEXT: %[[M43WO11:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 1) +; CHECK-NEXT: %[[M43VWO4:[^ ]+]] = insertelement <12 x float> %[[M43VWO3]], float %[[M43WO11]], i64 4 +; CHECK-NEXT: %[[M43WO21:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 1) +; CHECK-NEXT: %[[M43VWO5:[^ ]+]] = insertelement <12 x float> %[[M43VWO4]], float %[[M43WO21]], i64 5 +; CHECK-NEXT: %[[M43WO02:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 2) +; CHECK-NEXT: %[[M43VWO6:[^ ]+]] = insertelement <12 x float> %[[M43VWO5]], float %[[M43WO02]], i64 6 +; CHECK-NEXT: %[[M43WO12:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 2) +; CHECK-NEXT: %[[M43VWO7:[^ ]+]] = insertelement <12 x float> %[[M43VWO6]], float %[[M43WO12]], i64 7 +; CHECK-NEXT: %[[M43WO22:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 2) +; CHECK-NEXT: %[[M43VWO8:[^ ]+]] = insertelement <12 x float> %[[M43VWO7]], float %[[M43WO22]], i64 8 +; CHECK-NEXT: %[[M43WO03:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 0, i32 3) +; CHECK-NEXT: %[[M43VWO9:[^ ]+]] = insertelement <12 x float> %[[M43VWO8]], float %[[M43WO03]], i64 9 +; CHECK-NEXT: %[[M43WO13:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 1, i32 3) +; CHECK-NEXT: %[[M43VWO10:[^ ]+]] = insertelement <12 x float> %[[M43VWO9]], float %[[M43WO13]], i64 10 +; CHECK-NEXT: %[[M43WO23:[^ ]+]] = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %[[M43WOHO]], i32 2, i32 3) +; CHECK-NEXT: %{{[^ ]+}} = insertelement <12 x float> %[[M43VWO10]], float %[[M43WO23]], i64 11 + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +for.body.i.lr.ph: + %0 = alloca [12 x float] + %1 = alloca [3 x i32] + %2 = alloca [12 x float] + %3 = alloca [4 x i32] + %4 = alloca [12 x float] + %5 = alloca [3 x i32] + %6 = alloca [12 x float] + %7 = alloca [4 x i32] + %hit = alloca %dx.types.HitObject, align 4 + %8 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !19 ; line:69 col:3 + call void @llvm.lifetime.start(i64 4, i8* %8) #0, !dbg !19 ; line:69 col:3 + %9 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !23 ; line:69 col:17 + call void @"dx.hl.op..void (i32, 
%dx.types.HitObject*, i32)"(i32 388, %dx.types.HitObject* %hit, i32 1), !dbg !24 ; line:75 col:3 + %10 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 383, %dx.types.HitObject* %hit), !dbg !25 ; line:80 col:11 + %conv = zext i1 %10 to i32, !dbg !25 ; line:80 col:11 + %11 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 384, %dx.types.HitObject* %hit), !dbg !26 ; line:81 col:11 + %conv3 = zext i1 %11 to i32, !dbg !26 ; line:81 col:11 + %add4 = add nsw i32 %conv, %conv3, !dbg !27 ; line:81 col:8 + %12 = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 385, %dx.types.HitObject* %hit), !dbg !28 ; line:82 col:11 + %conv6 = zext i1 %12 to i32, !dbg !28 ; line:82 col:11 + %add7 = add nsw i32 %add4, %conv6, !dbg !29 ; line:82 col:8 + %13 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 365, %dx.types.HitObject* %hit), !dbg !30 ; line:85 col:11 + %add9 = add i32 %add7, %13, !dbg !31 ; line:85 col:8 + %14 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 366, %dx.types.HitObject* %hit), !dbg !32 ; line:86 col:11 + %add11 = add i32 %add9, %14, !dbg !33 ; line:86 col:8 + %15 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 368, %dx.types.HitObject* %hit), !dbg !34 ; line:87 col:11 + %add13 = add i32 %add11, %15, !dbg !35 ; line:87 col:8 + %16 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 367, %dx.types.HitObject* %hit), !dbg !36 ; line:88 col:11 + %add15 = add i32 %add13, %16, !dbg !37 ; line:88 col:8 + %17 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 373, %dx.types.HitObject* %hit), !dbg !38 ; line:89 col:11 + %add17 = add i32 %add15, %17, !dbg !39 ; line:89 col:8 + %18 = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 377, %dx.types.HitObject* %hit), !dbg !40 ; line:90 col:11 + %add19 = add i32 %add17, %18, !dbg !41 ; line:90 col:8 + %19 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %hit, i32 42), !dbg !42 ; line:91 col:11 + %add21 = add i32 %add19, %19, !dbg !43 ; line:91 col:8 + %20 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 379, %dx.types.HitObject* %hit), !dbg !44 ; line:94 col:11 + %add23 = fadd <3 x float> zeroinitializer, %20, !dbg !45 ; line:94 col:8 + %21 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 378, %dx.types.HitObject* %hit), !dbg !46 ; line:95 col:11 + %add25 = fadd <3 x float> %add23, %21, !dbg !47 ; line:95 col:8 + %22 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 370, %dx.types.HitObject* %hit), !dbg !48 ; line:96 col:11 + %add27 = fadd <3 x float> %add25, %22, !dbg !49 ; line:96 col:8 + %23 = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 369, %dx.types.HitObject* %hit), !dbg !50 ; line:97 col:11 + %add29 = fadd <3 x float> %add27, %23, !dbg !51 ; line:97 col:8 + %vsum.0.vec.extract = extractelement <3 x float> %add29, i32 0, !dbg !52 ; line:98 col:11 + %vsum.4.vec.extract = extractelement <3 x float> %add29, i32 1, !dbg !53 ; line:98 col:21 + %add30 = fadd float %vsum.0.vec.extract, %vsum.4.vec.extract, !dbg !54 ; line:98 col:19 + %vsum.8.vec.extract = extractelement <3 x float> %add29, i32 2, !dbg !55 ; line:98 col:31 + %add31 = fadd float %add30, %vsum.8.vec.extract, !dbg !56 ; line:98 col:29 + %add32 = fadd float 0.000000e+00, %add31, !dbg !57 ; line:98 col:8 + %24 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 371, %dx.types.HitObject* %hit), !dbg !58 ; 
line:101 col:23 + %row2col = shufflevector <12 x float> %24, <12 x float> %24, <12 x i32> , !dbg !59 ; line:101 col:11 + br label %for.body.7.i.lr.ph, !dbg !60 ; line:61 col:3 + +for.body.7.i.lr.ph: ; preds = %for.cond.cleanup.6.i, %for.body.i.lr.ph + %i.i.0 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc9.i, %for.cond.cleanup.6.i ] + %h.i.0 = phi float [ 0.000000e+00, %for.body.i.lr.ph ], [ %add.i, %for.cond.cleanup.6.i ] + br label %for.body.7.i, !dbg !63 ; line:62 col:5 + +for.cond.cleanup.6.i: ; preds = %for.body.7.i + %inc9.i = add nsw i32 %i.i.0, 1, !dbg !64 ; line:61 col:26 + %cmp.i = icmp slt i32 %inc9.i, 3, !dbg !65 ; line:61 col:21 + br i1 %cmp.i, label %for.body.7.i.lr.ph, label %for.body.i.8.lr.ph, !dbg !60 ; line:61 col:3 + +for.body.7.i: ; preds = %for.body.7.i.lr.ph, %for.body.7.i + %h.i.263 = phi float [ %h.i.0, %for.body.7.i.lr.ph ], [ %add.i, %for.body.7.i ] + %j.i.0 = phi i32 [ 0, %for.body.7.i.lr.ph ], [ %inc.i, %for.body.7.i ] + %25 = add i32 3, %i.i.0, !dbg !66 ; line:63 col:12 + %26 = add i32 6, %i.i.0, !dbg !66 ; line:63 col:12 + %27 = add i32 9, %i.i.0, !dbg !66 ; line:63 col:12 + %28 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 0, !dbg !66 ; line:63 col:12 + store i32 %i.i.0, i32* %28, !dbg !66 ; line:63 col:12 + %29 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 1, !dbg !66 ; line:63 col:12 + store i32 %25, i32* %29, !dbg !66 ; line:63 col:12 + %30 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 2, !dbg !66 ; line:63 col:12 + store i32 %26, i32* %30, !dbg !66 ; line:63 col:12 + %31 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 3, !dbg !66 ; line:63 col:12 + store i32 %27, i32* %31, !dbg !66 ; line:63 col:12 + %32 = getelementptr [4 x i32], [4 x i32]* %7, i32 0, i32 %j.i.0, !dbg !66 ; line:63 col:12 + %33 = load i32, i32* %32, !dbg !66 ; line:63 col:12 + %34 = extractelement <12 x float> %row2col, i64 0, !dbg !66 ; line:63 col:12 + %35 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 0, !dbg !66 ; line:63 col:12 + store float %34, float* %35, !dbg !66 ; line:63 col:12 + %36 = extractelement <12 x float> %row2col, i64 1, !dbg !66 ; line:63 col:12 + %37 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 1, !dbg !66 ; line:63 col:12 + store float %36, float* %37, !dbg !66 ; line:63 col:12 + %38 = extractelement <12 x float> %row2col, i64 2, !dbg !66 ; line:63 col:12 + %39 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 2, !dbg !66 ; line:63 col:12 + store float %38, float* %39, !dbg !66 ; line:63 col:12 + %40 = extractelement <12 x float> %row2col, i64 3, !dbg !66 ; line:63 col:12 + %41 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 3, !dbg !66 ; line:63 col:12 + store float %40, float* %41, !dbg !66 ; line:63 col:12 + %42 = extractelement <12 x float> %row2col, i64 4, !dbg !66 ; line:63 col:12 + %43 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 4, !dbg !66 ; line:63 col:12 + store float %42, float* %43, !dbg !66 ; line:63 col:12 + %44 = extractelement <12 x float> %row2col, i64 5, !dbg !66 ; line:63 col:12 + %45 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 5, !dbg !66 ; line:63 col:12 + store float %44, float* %45, !dbg !66 ; line:63 col:12 + %46 = extractelement <12 x float> %row2col, i64 6, !dbg !66 ; line:63 col:12 + %47 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 6, !dbg !66 ; line:63 col:12 + store float %46, float* %47, !dbg !66 ; line:63 col:12 + %48 = extractelement <12 x float> %row2col, i64 7, !dbg !66 ; line:63 col:12 + %49 = getelementptr [12 x float], 
[12 x float]* %6, i32 0, i32 7, !dbg !66 ; line:63 col:12 + store float %48, float* %49, !dbg !66 ; line:63 col:12 + %50 = extractelement <12 x float> %row2col, i64 8, !dbg !66 ; line:63 col:12 + %51 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 8, !dbg !66 ; line:63 col:12 + store float %50, float* %51, !dbg !66 ; line:63 col:12 + %52 = extractelement <12 x float> %row2col, i64 9, !dbg !66 ; line:63 col:12 + %53 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 9, !dbg !66 ; line:63 col:12 + store float %52, float* %53, !dbg !66 ; line:63 col:12 + %54 = extractelement <12 x float> %row2col, i64 10, !dbg !66 ; line:63 col:12 + %55 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 10, !dbg !66 ; line:63 col:12 + store float %54, float* %55, !dbg !66 ; line:63 col:12 + %56 = extractelement <12 x float> %row2col, i64 11, !dbg !66 ; line:63 col:12 + %57 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 11, !dbg !66 ; line:63 col:12 + store float %56, float* %57, !dbg !66 ; line:63 col:12 + %58 = getelementptr [12 x float], [12 x float]* %6, i32 0, i32 %33, !dbg !66 ; line:63 col:12 + %59 = load float, float* %58, !dbg !66 ; line:63 col:12 + %add.i = fadd float %h.i.263, %59, !dbg !67 ; line:63 col:9 + %inc.i = add nsw i32 %j.i.0, 1, !dbg !68 ; line:62 col:28 + %cmp3.i = icmp slt i32 %inc.i, 4, !dbg !69 ; line:62 col:23 + br i1 %cmp3.i, label %for.body.7.i, label %for.cond.cleanup.6.i, !dbg !63 ; line:62 col:5 + +for.body.i.8.lr.ph: ; preds = %for.cond.cleanup.6.i + %add35 = fadd float %add32, %add.i, !dbg !70 ; line:101 col:8 + %60 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 372, %dx.types.HitObject* %hit), !dbg !71 ; line:102 col:23 + %row2col52 = shufflevector <12 x float> %60, <12 x float> %60, <12 x i32> , !dbg !72 ; line:102 col:11 + br label %for.body.7.i.15.lr.ph, !dbg !73 ; line:61 col:3 + +for.body.7.i.15.lr.ph: ; preds = %for.cond.cleanup.6.i.12, %for.body.i.8.lr.ph + %i.i.3.0 = phi i32 [ 0, %for.body.i.8.lr.ph ], [ %inc9.i.11, %for.cond.cleanup.6.i.12 ] + %h.i.2.0 = phi float [ 0.000000e+00, %for.body.i.8.lr.ph ], [ %add.i.13, %for.cond.cleanup.6.i.12 ] + br label %for.body.7.i.15, !dbg !76 ; line:62 col:5 + +for.cond.cleanup.6.i.12: ; preds = %for.body.7.i.15 + %inc9.i.11 = add nsw i32 %i.i.3.0, 1, !dbg !77 ; line:61 col:26 + %cmp.i.6 = icmp slt i32 %inc9.i.11, 4, !dbg !78 ; line:61 col:21 + br i1 %cmp.i.6, label %for.body.7.i.15.lr.ph, label %for.body.i.23.lr.ph, !dbg !73 ; line:61 col:3 + +for.body.7.i.15: ; preds = %for.body.7.i.15.lr.ph, %for.body.7.i.15 + %j.i.5.0 = phi i32 [ 0, %for.body.7.i.15.lr.ph ], [ %inc.i.14, %for.body.7.i.15 ] + %h.i.2.2 = phi float [ %h.i.2.0, %for.body.7.i.15.lr.ph ], [ %add.i.13, %for.body.7.i.15 ] + %61 = add i32 4, %i.i.3.0, !dbg !79 ; line:63 col:12 + %62 = add i32 8, %i.i.3.0, !dbg !79 ; line:63 col:12 + %63 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 0, !dbg !79 ; line:63 col:12 + store i32 %i.i.3.0, i32* %63, !dbg !79 ; line:63 col:12 + %64 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 1, !dbg !79 ; line:63 col:12 + store i32 %61, i32* %64, !dbg !79 ; line:63 col:12 + %65 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 2, !dbg !79 ; line:63 col:12 + store i32 %62, i32* %65, !dbg !79 ; line:63 col:12 + %66 = getelementptr [3 x i32], [3 x i32]* %5, i32 0, i32 %j.i.5.0, !dbg !79 ; line:63 col:12 + %67 = load i32, i32* %66, !dbg !79 ; line:63 col:12 + %68 = extractelement <12 x float> %row2col52, i64 0, !dbg !79 ; line:63 col:12 + %69 = getelementptr 
[12 x float], [12 x float]* %4, i32 0, i32 0, !dbg !79 ; line:63 col:12 + store float %68, float* %69, !dbg !79 ; line:63 col:12 + %70 = extractelement <12 x float> %row2col52, i64 1, !dbg !79 ; line:63 col:12 + %71 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 1, !dbg !79 ; line:63 col:12 + store float %70, float* %71, !dbg !79 ; line:63 col:12 + %72 = extractelement <12 x float> %row2col52, i64 2, !dbg !79 ; line:63 col:12 + %73 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 2, !dbg !79 ; line:63 col:12 + store float %72, float* %73, !dbg !79 ; line:63 col:12 + %74 = extractelement <12 x float> %row2col52, i64 3, !dbg !79 ; line:63 col:12 + %75 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 3, !dbg !79 ; line:63 col:12 + store float %74, float* %75, !dbg !79 ; line:63 col:12 + %76 = extractelement <12 x float> %row2col52, i64 4, !dbg !79 ; line:63 col:12 + %77 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 4, !dbg !79 ; line:63 col:12 + store float %76, float* %77, !dbg !79 ; line:63 col:12 + %78 = extractelement <12 x float> %row2col52, i64 5, !dbg !79 ; line:63 col:12 + %79 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 5, !dbg !79 ; line:63 col:12 + store float %78, float* %79, !dbg !79 ; line:63 col:12 + %80 = extractelement <12 x float> %row2col52, i64 6, !dbg !79 ; line:63 col:12 + %81 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 6, !dbg !79 ; line:63 col:12 + store float %80, float* %81, !dbg !79 ; line:63 col:12 + %82 = extractelement <12 x float> %row2col52, i64 7, !dbg !79 ; line:63 col:12 + %83 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 7, !dbg !79 ; line:63 col:12 + store float %82, float* %83, !dbg !79 ; line:63 col:12 + %84 = extractelement <12 x float> %row2col52, i64 8, !dbg !79 ; line:63 col:12 + %85 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 8, !dbg !79 ; line:63 col:12 + store float %84, float* %85, !dbg !79 ; line:63 col:12 + %86 = extractelement <12 x float> %row2col52, i64 9, !dbg !79 ; line:63 col:12 + %87 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 9, !dbg !79 ; line:63 col:12 + store float %86, float* %87, !dbg !79 ; line:63 col:12 + %88 = extractelement <12 x float> %row2col52, i64 10, !dbg !79 ; line:63 col:12 + %89 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 10, !dbg !79 ; line:63 col:12 + store float %88, float* %89, !dbg !79 ; line:63 col:12 + %90 = extractelement <12 x float> %row2col52, i64 11, !dbg !79 ; line:63 col:12 + %91 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 11, !dbg !79 ; line:63 col:12 + store float %90, float* %91, !dbg !79 ; line:63 col:12 + %92 = getelementptr [12 x float], [12 x float]* %4, i32 0, i32 %67, !dbg !79 ; line:63 col:12 + %93 = load float, float* %92, !dbg !79 ; line:63 col:12 + %add.i.13 = fadd float %h.i.2.2, %93, !dbg !80 ; line:63 col:9 + %inc.i.14 = add nsw i32 %j.i.5.0, 1, !dbg !81 ; line:62 col:28 + %cmp3.i.9 = icmp slt i32 %inc.i.14, 3, !dbg !82 ; line:62 col:23 + br i1 %cmp3.i.9, label %for.body.7.i.15, label %for.cond.cleanup.6.i.12, !dbg !76 ; line:62 col:5 + +for.body.i.23.lr.ph: ; preds = %for.cond.cleanup.6.i.12 + %add38 = fadd float %add35, %add.i.13, !dbg !83 ; line:102 col:8 + %94 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 380, %dx.types.HitObject* %hit), !dbg !84 ; line:103 col:23 + %row2col53 = shufflevector <12 x float> %94, <12 x float> %94, <12 x i32> , !dbg !85 ; line:103 col:11 + br label %for.body.7.i.30.lr.ph, !dbg 
!86 ; line:61 col:3 + +for.body.7.i.30.lr.ph: ; preds = %for.cond.cleanup.6.i.27, %for.body.i.23.lr.ph + %i.i.18.0 = phi i32 [ 0, %for.body.i.23.lr.ph ], [ %inc9.i.26, %for.cond.cleanup.6.i.27 ] + %h.i.17.0 = phi float [ 0.000000e+00, %for.body.i.23.lr.ph ], [ %add.i.28, %for.cond.cleanup.6.i.27 ] + br label %for.body.7.i.30, !dbg !88 ; line:62 col:5 + +for.cond.cleanup.6.i.27: ; preds = %for.body.7.i.30 + %inc9.i.26 = add nsw i32 %i.i.18.0, 1, !dbg !89 ; line:61 col:26 + %cmp.i.21 = icmp slt i32 %inc9.i.26, 3, !dbg !90 ; line:61 col:21 + br i1 %cmp.i.21, label %for.body.7.i.30.lr.ph, label %for.body.i.39.lr.ph, !dbg !86 ; line:61 col:3 + +for.body.7.i.30: ; preds = %for.body.7.i.30.lr.ph, %for.body.7.i.30 + %j.i.20.0 = phi i32 [ 0, %for.body.7.i.30.lr.ph ], [ %inc.i.29, %for.body.7.i.30 ] + %h.i.17.2 = phi float [ %h.i.17.0, %for.body.7.i.30.lr.ph ], [ %add.i.28, %for.body.7.i.30 ] + %95 = add i32 3, %i.i.18.0, !dbg !91 ; line:63 col:12 + %96 = add i32 6, %i.i.18.0, !dbg !91 ; line:63 col:12 + %97 = add i32 9, %i.i.18.0, !dbg !91 ; line:63 col:12 + %98 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 0, !dbg !91 ; line:63 col:12 + store i32 %i.i.18.0, i32* %98, !dbg !91 ; line:63 col:12 + %99 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 1, !dbg !91 ; line:63 col:12 + store i32 %95, i32* %99, !dbg !91 ; line:63 col:12 + %100 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 2, !dbg !91 ; line:63 col:12 + store i32 %96, i32* %100, !dbg !91 ; line:63 col:12 + %101 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 3, !dbg !91 ; line:63 col:12 + store i32 %97, i32* %101, !dbg !91 ; line:63 col:12 + %102 = getelementptr [4 x i32], [4 x i32]* %3, i32 0, i32 %j.i.20.0, !dbg !91 ; line:63 col:12 + %103 = load i32, i32* %102, !dbg !91 ; line:63 col:12 + %104 = extractelement <12 x float> %row2col53, i64 0, !dbg !91 ; line:63 col:12 + %105 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 0, !dbg !91 ; line:63 col:12 + store float %104, float* %105, !dbg !91 ; line:63 col:12 + %106 = extractelement <12 x float> %row2col53, i64 1, !dbg !91 ; line:63 col:12 + %107 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 1, !dbg !91 ; line:63 col:12 + store float %106, float* %107, !dbg !91 ; line:63 col:12 + %108 = extractelement <12 x float> %row2col53, i64 2, !dbg !91 ; line:63 col:12 + %109 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 2, !dbg !91 ; line:63 col:12 + store float %108, float* %109, !dbg !91 ; line:63 col:12 + %110 = extractelement <12 x float> %row2col53, i64 3, !dbg !91 ; line:63 col:12 + %111 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 3, !dbg !91 ; line:63 col:12 + store float %110, float* %111, !dbg !91 ; line:63 col:12 + %112 = extractelement <12 x float> %row2col53, i64 4, !dbg !91 ; line:63 col:12 + %113 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 4, !dbg !91 ; line:63 col:12 + store float %112, float* %113, !dbg !91 ; line:63 col:12 + %114 = extractelement <12 x float> %row2col53, i64 5, !dbg !91 ; line:63 col:12 + %115 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 5, !dbg !91 ; line:63 col:12 + store float %114, float* %115, !dbg !91 ; line:63 col:12 + %116 = extractelement <12 x float> %row2col53, i64 6, !dbg !91 ; line:63 col:12 + %117 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 6, !dbg !91 ; line:63 col:12 + store float %116, float* %117, !dbg !91 ; line:63 col:12 + %118 = extractelement <12 x float> %row2col53, i64 7, !dbg !91 ; line:63 col:12 + %119 = getelementptr [12 x 
float], [12 x float]* %2, i32 0, i32 7, !dbg !91 ; line:63 col:12 + store float %118, float* %119, !dbg !91 ; line:63 col:12 + %120 = extractelement <12 x float> %row2col53, i64 8, !dbg !91 ; line:63 col:12 + %121 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 8, !dbg !91 ; line:63 col:12 + store float %120, float* %121, !dbg !91 ; line:63 col:12 + %122 = extractelement <12 x float> %row2col53, i64 9, !dbg !91 ; line:63 col:12 + %123 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 9, !dbg !91 ; line:63 col:12 + store float %122, float* %123, !dbg !91 ; line:63 col:12 + %124 = extractelement <12 x float> %row2col53, i64 10, !dbg !91 ; line:63 col:12 + %125 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 10, !dbg !91 ; line:63 col:12 + store float %124, float* %125, !dbg !91 ; line:63 col:12 + %126 = extractelement <12 x float> %row2col53, i64 11, !dbg !91 ; line:63 col:12 + %127 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 11, !dbg !91 ; line:63 col:12 + store float %126, float* %127, !dbg !91 ; line:63 col:12 + %128 = getelementptr [12 x float], [12 x float]* %2, i32 0, i32 %103, !dbg !91 ; line:63 col:12 + %129 = load float, float* %128, !dbg !91 ; line:63 col:12 + %add.i.28 = fadd float %h.i.17.2, %129, !dbg !92 ; line:63 col:9 + %inc.i.29 = add nsw i32 %j.i.20.0, 1, !dbg !93 ; line:62 col:28 + %cmp3.i.24 = icmp slt i32 %inc.i.29, 4, !dbg !94 ; line:62 col:23 + br i1 %cmp3.i.24, label %for.body.7.i.30, label %for.cond.cleanup.6.i.27, !dbg !88 ; line:62 col:5 + +for.body.i.39.lr.ph: ; preds = %for.cond.cleanup.6.i.27 + %add41 = fadd float %add38, %add.i.28, !dbg !95 ; line:103 col:8 + %130 = call <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32 381, %dx.types.HitObject* %hit), !dbg !96 ; line:104 col:23 + %row2col54 = shufflevector <12 x float> %130, <12 x float> %130, <12 x i32> , !dbg !97 ; line:104 col:11 + br label %for.body.7.i.46.lr.ph, !dbg !98 ; line:61 col:3 + +for.body.7.i.46.lr.ph: ; preds = %for.cond.cleanup.6.i.43, %for.body.i.39.lr.ph + %i.i.34.0 = phi i32 [ 0, %for.body.i.39.lr.ph ], [ %inc9.i.42, %for.cond.cleanup.6.i.43 ] + %h.i.33.0 = phi float [ 0.000000e+00, %for.body.i.39.lr.ph ], [ %add.i.44, %for.cond.cleanup.6.i.43 ] + br label %for.body.7.i.46, !dbg !100 ; line:62 col:5 + +for.cond.cleanup.6.i.43: ; preds = %for.body.7.i.46 + %inc9.i.42 = add nsw i32 %i.i.34.0, 1, !dbg !101 ; line:61 col:26 + %cmp.i.37 = icmp slt i32 %inc9.i.42, 4, !dbg !102 ; line:61 col:21 + br i1 %cmp.i.37, label %for.body.7.i.46.lr.ph, label %"\01??$hashM@$03$02@@YAMV?$matrix@M$03$02@@@Z.exit.47", !dbg !98 ; line:61 col:3 + +for.body.7.i.46: ; preds = %for.body.7.i.46.lr.ph, %for.body.7.i.46 + %j.i.36.0 = phi i32 [ 0, %for.body.7.i.46.lr.ph ], [ %inc.i.45, %for.body.7.i.46 ] + %h.i.33.2 = phi float [ %h.i.33.0, %for.body.7.i.46.lr.ph ], [ %add.i.44, %for.body.7.i.46 ] + %131 = add i32 4, %i.i.34.0, !dbg !103 ; line:63 col:12 + %132 = add i32 8, %i.i.34.0, !dbg !103 ; line:63 col:12 + %133 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 0, !dbg !103 ; line:63 col:12 + store i32 %i.i.34.0, i32* %133, !dbg !103 ; line:63 col:12 + %134 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 1, !dbg !103 ; line:63 col:12 + store i32 %131, i32* %134, !dbg !103 ; line:63 col:12 + %135 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 2, !dbg !103 ; line:63 col:12 + store i32 %132, i32* %135, !dbg !103 ; line:63 col:12 + %136 = getelementptr [3 x i32], [3 x i32]* %1, i32 0, i32 %j.i.36.0, !dbg !103 ; line:63 col:12 + %137 = 
load i32, i32* %136, !dbg !103 ; line:63 col:12 + %138 = extractelement <12 x float> %row2col54, i64 0, !dbg !103 ; line:63 col:12 + %139 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 0, !dbg !103 ; line:63 col:12 + store float %138, float* %139, !dbg !103 ; line:63 col:12 + %140 = extractelement <12 x float> %row2col54, i64 1, !dbg !103 ; line:63 col:12 + %141 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 1, !dbg !103 ; line:63 col:12 + store float %140, float* %141, !dbg !103 ; line:63 col:12 + %142 = extractelement <12 x float> %row2col54, i64 2, !dbg !103 ; line:63 col:12 + %143 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 2, !dbg !103 ; line:63 col:12 + store float %142, float* %143, !dbg !103 ; line:63 col:12 + %144 = extractelement <12 x float> %row2col54, i64 3, !dbg !103 ; line:63 col:12 + %145 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 3, !dbg !103 ; line:63 col:12 + store float %144, float* %145, !dbg !103 ; line:63 col:12 + %146 = extractelement <12 x float> %row2col54, i64 4, !dbg !103 ; line:63 col:12 + %147 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 4, !dbg !103 ; line:63 col:12 + store float %146, float* %147, !dbg !103 ; line:63 col:12 + %148 = extractelement <12 x float> %row2col54, i64 5, !dbg !103 ; line:63 col:12 + %149 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 5, !dbg !103 ; line:63 col:12 + store float %148, float* %149, !dbg !103 ; line:63 col:12 + %150 = extractelement <12 x float> %row2col54, i64 6, !dbg !103 ; line:63 col:12 + %151 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 6, !dbg !103 ; line:63 col:12 + store float %150, float* %151, !dbg !103 ; line:63 col:12 + %152 = extractelement <12 x float> %row2col54, i64 7, !dbg !103 ; line:63 col:12 + %153 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 7, !dbg !103 ; line:63 col:12 + store float %152, float* %153, !dbg !103 ; line:63 col:12 + %154 = extractelement <12 x float> %row2col54, i64 8, !dbg !103 ; line:63 col:12 + %155 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 8, !dbg !103 ; line:63 col:12 + store float %154, float* %155, !dbg !103 ; line:63 col:12 + %156 = extractelement <12 x float> %row2col54, i64 9, !dbg !103 ; line:63 col:12 + %157 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 9, !dbg !103 ; line:63 col:12 + store float %156, float* %157, !dbg !103 ; line:63 col:12 + %158 = extractelement <12 x float> %row2col54, i64 10, !dbg !103 ; line:63 col:12 + %159 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 10, !dbg !103 ; line:63 col:12 + store float %158, float* %159, !dbg !103 ; line:63 col:12 + %160 = extractelement <12 x float> %row2col54, i64 11, !dbg !103 ; line:63 col:12 + %161 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 11, !dbg !103 ; line:63 col:12 + store float %160, float* %161, !dbg !103 ; line:63 col:12 + %162 = getelementptr [12 x float], [12 x float]* %0, i32 0, i32 %137, !dbg !103 ; line:63 col:12 + %163 = load float, float* %162, !dbg !103 ; line:63 col:12 + %add.i.44 = fadd float %h.i.33.2, %163, !dbg !104 ; line:63 col:9 + %inc.i.45 = add nsw i32 %j.i.36.0, 1, !dbg !105 ; line:62 col:28 + %cmp3.i.40 = icmp slt i32 %inc.i.45, 3, !dbg !106 ; line:62 col:23 + br i1 %cmp3.i.40, label %for.body.7.i.46, label %for.cond.cleanup.6.i.43, !dbg !100 ; line:62 col:5 + +"\01??$hashM@$03$02@@YAMV?$matrix@M$03$02@@@Z.exit.47": ; preds = %for.cond.cleanup.6.i.43 + %add44 = fadd float %add41, %add.i.44, !dbg !107 ; line:104 col:8 + %164 
= call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 374, %dx.types.HitObject* %hit), !dbg !108 ; line:107 col:11 + %add46 = add i32 %add21, %164, !dbg !109 ; line:107 col:8 + %165 = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 376, %dx.types.HitObject* %hit), !dbg !110 ; line:108 col:11 + %add48 = fadd float %add44, %165, !dbg !111 ; line:108 col:8 + %166 = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 375, %dx.types.HitObject* %hit), !dbg !112 ; line:109 col:11 + %add50 = fadd float %add48, %166, !dbg !113 ; line:109 col:8 + %167 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !114 ; line:111 col:3 + %168 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %167), !dbg !114 ; line:111 col:3 + %169 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %168, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !114 ; line:111 col:3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %169, i32 0, float %add50), !dbg !114 ; line:111 col:3 + %170 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !115 ; line:112 col:3 + %171 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %170), !dbg !115 ; line:112 col:3 + %172 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %171, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !115 ; line:112 col:3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32)"(i32 277, %dx.types.Handle %172, i32 4, i32 %add46), !dbg !115 ; line:112 col:3 + %173 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !116 ; line:113 col:1 + call void @llvm.lifetime.end(i64 4, i8* %173) #0, !dbg !116 ; line:113 col:1 + ret void, !dbg !116 ; line:113 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #0 + +; Function Attrs: nounwind readnone +declare i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1 + +; Function Attrs: nounwind readnone +declare i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1 + +; Function Attrs: nounwind readonly +declare i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #2 + +; Function Attrs: nounwind readnone +declare <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1 + +; Function Attrs: nounwind readnone +declare float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, 
%dx.types.Handle, i32, float) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32)"(i32, %dx.types.Handle, i32, i32) #0 + +; Function Attrs: nounwind readnone +declare <12 x float> @"dx.hl.op.rn.<12 x float> (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!3} +!dx.shaderModel = !{!4} +!dx.typeAnnotations = !{!5, !8} +!dx.entryPoints = !{!12} +!dx.fnprops = !{!16} +!dx.options = !{!17, !18} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.4891 (staging/ser_hlslaccessors_patch, 1ca27ee12)"} +!3 = !{i32 1, i32 9} +!4 = !{!"lib", i32 6, i32 9} +!5 = !{i32 0, %"class.dx::HitObject" undef, !6} +!6 = !{i32 4, !7} +!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!8 = !{i32 1, void ()* @"\01?main@@YAXXZ", !9} +!9 = !{!10} +!10 = !{i32 1, !11, !11} +!11 = !{} +!12 = !{null, !"", null, !13, null} +!13 = !{null, !14, null, null} +!14 = !{!15} +!15 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!16 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!17 = !{i32 -2147483584} +!18 = !{i32 -1} +!19 = !DILocation(line: 69, column: 3, scope: !20) +!20 = !DISubprogram(name: "main", scope: !21, file: !21, line: 68, type: !22, isLocal: false, isDefinition: true, scopeLine: 68, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!21 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_accessors.hlsl", directory: "") +!22 = !DISubroutineType(types: !11) +!23 = !DILocation(line: 69, column: 17, scope: !20) +!24 = !DILocation(line: 75, column: 3, scope: !20) +!25 = !DILocation(line: 80, column: 11, scope: !20) +!26 = !DILocation(line: 81, column: 11, scope: !20) +!27 = !DILocation(line: 81, column: 8, scope: !20) +!28 = !DILocation(line: 82, column: 11, scope: !20) +!29 = !DILocation(line: 82, column: 8, scope: !20) +!30 = !DILocation(line: 85, column: 11, scope: !20) +!31 = !DILocation(line: 85, column: 8, scope: !20) +!32 = !DILocation(line: 86, column: 11, scope: !20) +!33 = !DILocation(line: 86, column: 8, scope: !20) +!34 = !DILocation(line: 87, column: 11, scope: !20) +!35 = !DILocation(line: 87, column: 8, scope: !20) +!36 = !DILocation(line: 88, column: 11, scope: !20) +!37 = !DILocation(line: 88, column: 8, scope: !20) +!38 = !DILocation(line: 89, column: 11, scope: !20) +!39 = !DILocation(line: 89, column: 8, scope: !20) +!40 = !DILocation(line: 90, column: 11, scope: !20) +!41 = !DILocation(line: 90, column: 8, scope: !20) +!42 = !DILocation(line: 91, column: 11, scope: !20) +!43 = !DILocation(line: 91, column: 8, scope: !20) +!44 = !DILocation(line: 94, column: 11, scope: !20) +!45 = !DILocation(line: 94, column: 8, scope: !20) +!46 = !DILocation(line: 95, 
column: 11, scope: !20) +!47 = !DILocation(line: 95, column: 8, scope: !20) +!48 = !DILocation(line: 96, column: 11, scope: !20) +!49 = !DILocation(line: 96, column: 8, scope: !20) +!50 = !DILocation(line: 97, column: 11, scope: !20) +!51 = !DILocation(line: 97, column: 8, scope: !20) +!52 = !DILocation(line: 98, column: 11, scope: !20) +!53 = !DILocation(line: 98, column: 21, scope: !20) +!54 = !DILocation(line: 98, column: 19, scope: !20) +!55 = !DILocation(line: 98, column: 31, scope: !20) +!56 = !DILocation(line: 98, column: 29, scope: !20) +!57 = !DILocation(line: 98, column: 8, scope: !20) +!58 = !DILocation(line: 101, column: 23, scope: !20) +!59 = !DILocation(line: 101, column: 11, scope: !20) +!60 = !DILocation(line: 61, column: 3, scope: !61, inlinedAt: !62) +!61 = !DISubprogram(name: "hashM<3, 4>", scope: !21, file: !21, line: 59, type: !22, isLocal: false, isDefinition: true, scopeLine: 59, flags: DIFlagPrototyped, isOptimized: false) +!62 = distinct !DILocation(line: 101, column: 11, scope: !20) +!63 = !DILocation(line: 62, column: 5, scope: !61, inlinedAt: !62) +!64 = !DILocation(line: 61, column: 26, scope: !61, inlinedAt: !62) +!65 = !DILocation(line: 61, column: 21, scope: !61, inlinedAt: !62) +!66 = !DILocation(line: 63, column: 12, scope: !61, inlinedAt: !62) +!67 = !DILocation(line: 63, column: 9, scope: !61, inlinedAt: !62) +!68 = !DILocation(line: 62, column: 28, scope: !61, inlinedAt: !62) +!69 = !DILocation(line: 62, column: 23, scope: !61, inlinedAt: !62) +!70 = !DILocation(line: 101, column: 8, scope: !20) +!71 = !DILocation(line: 102, column: 23, scope: !20) +!72 = !DILocation(line: 102, column: 11, scope: !20) +!73 = !DILocation(line: 61, column: 3, scope: !74, inlinedAt: !75) +!74 = !DISubprogram(name: "hashM<4, 3>", scope: !21, file: !21, line: 59, type: !22, isLocal: false, isDefinition: true, scopeLine: 59, flags: DIFlagPrototyped, isOptimized: false) +!75 = distinct !DILocation(line: 102, column: 11, scope: !20) +!76 = !DILocation(line: 62, column: 5, scope: !74, inlinedAt: !75) +!77 = !DILocation(line: 61, column: 26, scope: !74, inlinedAt: !75) +!78 = !DILocation(line: 61, column: 21, scope: !74, inlinedAt: !75) +!79 = !DILocation(line: 63, column: 12, scope: !74, inlinedAt: !75) +!80 = !DILocation(line: 63, column: 9, scope: !74, inlinedAt: !75) +!81 = !DILocation(line: 62, column: 28, scope: !74, inlinedAt: !75) +!82 = !DILocation(line: 62, column: 23, scope: !74, inlinedAt: !75) +!83 = !DILocation(line: 102, column: 8, scope: !20) +!84 = !DILocation(line: 103, column: 23, scope: !20) +!85 = !DILocation(line: 103, column: 11, scope: !20) +!86 = !DILocation(line: 61, column: 3, scope: !61, inlinedAt: !87) +!87 = distinct !DILocation(line: 103, column: 11, scope: !20) +!88 = !DILocation(line: 62, column: 5, scope: !61, inlinedAt: !87) +!89 = !DILocation(line: 61, column: 26, scope: !61, inlinedAt: !87) +!90 = !DILocation(line: 61, column: 21, scope: !61, inlinedAt: !87) +!91 = !DILocation(line: 63, column: 12, scope: !61, inlinedAt: !87) +!92 = !DILocation(line: 63, column: 9, scope: !61, inlinedAt: !87) +!93 = !DILocation(line: 62, column: 28, scope: !61, inlinedAt: !87) +!94 = !DILocation(line: 62, column: 23, scope: !61, inlinedAt: !87) +!95 = !DILocation(line: 103, column: 8, scope: !20) +!96 = !DILocation(line: 104, column: 23, scope: !20) +!97 = !DILocation(line: 104, column: 11, scope: !20) +!98 = !DILocation(line: 61, column: 3, scope: !74, inlinedAt: !99) +!99 = distinct !DILocation(line: 104, column: 11, scope: !20) +!100 = !DILocation(line: 62, 
column: 5, scope: !74, inlinedAt: !99) +!101 = !DILocation(line: 61, column: 26, scope: !74, inlinedAt: !99) +!102 = !DILocation(line: 61, column: 21, scope: !74, inlinedAt: !99) +!103 = !DILocation(line: 63, column: 12, scope: !74, inlinedAt: !99) +!104 = !DILocation(line: 63, column: 9, scope: !74, inlinedAt: !99) +!105 = !DILocation(line: 62, column: 28, scope: !74, inlinedAt: !99) +!106 = !DILocation(line: 62, column: 23, scope: !74, inlinedAt: !99) +!107 = !DILocation(line: 104, column: 8, scope: !20) +!108 = !DILocation(line: 107, column: 11, scope: !20) +!109 = !DILocation(line: 107, column: 8, scope: !20) +!110 = !DILocation(line: 108, column: 11, scope: !20) +!111 = !DILocation(line: 108, column: 8, scope: !20) +!112 = !DILocation(line: 109, column: 11, scope: !20) +!113 = !DILocation(line: 109, column: 8, scope: !20) +!114 = !DILocation(line: 111, column: 3, scope: !20) +!115 = !DILocation(line: 112, column: 3, scope: !20) +!116 = !DILocation(line: 113, column: 1, scope: !20) diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll new file mode 100644 index 0000000000..3488a3df03 --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_attributes_dxilgen.ll @@ -0,0 +1,151 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +; +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; outbuf UAV byte r/w U0u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RWByteAddressBuffer = type { i32 } +%dx.types.HitObject = type { i8* } +%struct.CustomAttrs = type { <4 x float>, i32 } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.dx::HitObject" = type { i32 } + +@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 + +; CHECK: %[[ATTRA:[^ ]+]] = alloca %struct.CustomAttrs, align 4 +; CHECK: call void @dx.op.hitObject_Attributes.struct.CustomAttrs(i32 289, %dx.types.HitObject %{{[^ ]+}}, %struct.CustomAttrs* %[[ATTRA]]) + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +entry: + %hit = alloca %dx.types.HitObject, align 4 + %attrs = alloca %struct.CustomAttrs, align 4 + %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !21 ; line:29 col:3 + call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !21 ; line:29 col:3 + %1 = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %hit), !dbg !25 ; line:29 col:17 + %2 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !26 ; line:30 col:3 + call void @llvm.lifetime.start(i64 20, i8* %2) #0, !dbg !26 ; line:30 col:3 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32 364, %dx.types.HitObject* %hit, %struct.CustomAttrs* %attrs), !dbg !27 ; line:31 col:3 + %v = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !28 ; line:32 col:21 + %3 = load <4 x float>, <4 x float>* %v, align 4, !dbg !29 ; line:32 col:15 + %4 = extractelement <4 x float> %3, i32 0, !dbg !29 ; line:32 col:15 + %v1 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !30 ; line:32 col:33 + %5 = load <4 x 
float>, <4 x float>* %v1, align 4, !dbg !31 ; line:32 col:27 + %6 = extractelement <4 x float> %5, i32 1, !dbg !31 ; line:32 col:27 + %add = fadd float %4, %6, !dbg !32 ; line:32 col:25 + %v2 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !33 ; line:32 col:45 + %7 = load <4 x float>, <4 x float>* %v2, align 4, !dbg !34 ; line:32 col:39 + %8 = extractelement <4 x float> %7, i32 2, !dbg !34 ; line:32 col:39 + %add3 = fadd float %add, %8, !dbg !35 ; line:32 col:37 + %v4 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !36 ; line:32 col:57 + %9 = load <4 x float>, <4 x float>* %v4, align 4, !dbg !37 ; line:32 col:51 + %10 = extractelement <4 x float> %9, i32 3, !dbg !37 ; line:32 col:51 + %add5 = fadd float %add3, %10, !dbg !38 ; line:32 col:49 + %y = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !39 ; line:32 col:69 + %11 = load i32, i32* %y, align 4, !dbg !39, !tbaa !40 ; line:32 col:69 + %conv = sitofp i32 %11 to float, !dbg !44 ; line:32 col:63 + %add6 = fadd float %add5, %conv, !dbg !45 ; line:32 col:61 + %12 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !46 ; line:33 col:3 + %13 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %12), !dbg !46 ; line:33 col:3 + %14 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %13, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !46 ; line:33 col:3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %14, i32 0, float %add6), !dbg !46 ; line:33 col:3 + %15 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !47 ; line:34 col:1 + call void @llvm.lifetime.end(i64 20, i8* %15) #0, !dbg !47 ; line:34 col:1 + %16 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !47 ; line:34 col:1 + call void @llvm.lifetime.end(i64 4, i8* %16) #0, !dbg !47 ; line:34 col:1 + ret void, !dbg !47 ; line:34 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, %struct.CustomAttrs*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!dx.version = !{!2} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = 
!{!4, !10} +!dx.entryPoints = !{!14} +!dx.fnprops = !{!18} +!dx.options = !{!19, !20} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"lib", i32 6, i32 9} +!4 = !{i32 0, %"class.dx::HitObject" undef, !5, %struct.CustomAttrs undef, !7} +!5 = !{i32 4, !6} +!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!7 = !{i32 20, !8, !9} +!8 = !{i32 6, !"v", i32 3, i32 0, i32 7, i32 9, i32 13, i32 4} +!9 = !{i32 6, !"y", i32 3, i32 16, i32 7, i32 4} +!10 = !{i32 1, void ()* @"\01?main@@YAXXZ", !11} +!11 = !{!12} +!12 = !{i32 1, !13, !13} +!13 = !{} +!14 = !{null, !"", null, !15, null} +!15 = !{null, !16, null, null} +!16 = !{!17} +!17 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!18 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!19 = !{i32 -2147483584} +!20 = !{i32 -1} +!21 = !DILocation(line: 29, column: 3, scope: !22) +!22 = !DISubprogram(name: "main", scope: !23, file: !23, line: 28, type: !24, isLocal: false, isDefinition: true, scopeLine: 28, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!23 = !DIFile(filename: "tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl", directory: "") +!24 = !DISubroutineType(types: !13) +!25 = !DILocation(line: 29, column: 17, scope: !22) +!26 = !DILocation(line: 30, column: 3, scope: !22) +!27 = !DILocation(line: 31, column: 3, scope: !22) +!28 = !DILocation(line: 32, column: 21, scope: !22) +!29 = !DILocation(line: 32, column: 15, scope: !22) +!30 = !DILocation(line: 32, column: 33, scope: !22) +!31 = !DILocation(line: 32, column: 27, scope: !22) +!32 = !DILocation(line: 32, column: 25, scope: !22) +!33 = !DILocation(line: 32, column: 45, scope: !22) +!34 = !DILocation(line: 32, column: 39, scope: !22) +!35 = !DILocation(line: 32, column: 37, scope: !22) +!36 = !DILocation(line: 32, column: 57, scope: !22) +!37 = !DILocation(line: 32, column: 51, scope: !22) +!38 = !DILocation(line: 32, column: 49, scope: !22) +!39 = !DILocation(line: 32, column: 69, scope: !22) +!40 = !{!41, !41, i64 0} +!41 = !{!"int", !42, i64 0} +!42 = !{!"omnipotent char", !43, i64 0} +!43 = !{!"Simple C/C++ TBAA"} +!44 = !DILocation(line: 32, column: 63, scope: !22) +!45 = !DILocation(line: 32, column: 61, scope: !22) +!46 = !DILocation(line: 33, column: 3, scope: !22) +!47 = !DILocation(line: 34, column: 1, scope: !22) diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll new file mode 100644 index 0000000000..0ae8e36fa7 --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_fromrayquery_dxilgen.ll @@ -0,0 +1,146 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +; +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; RTAS texture i32 ras T0t4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%struct.CustomAttrs = type { float, float } +%dx.types.HitObject = type { i8* } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.RWStructuredBuffer" = type { float 
} +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%"class.dx::HitObject" = type { i32 } +%"class.RayQuery<5, 0>" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 + +; CHECK: %[[ATTRA:[^ ]+]] = alloca %struct.CustomAttrs +; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQH:[^ ]+]], %dx.types.Handle %{{[^ ]+}}, i32 0, i32 255, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03) +; CHECK: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %[[RQH]]) +; CHECK: %{{[^ ]+}} = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %[[RQH]], i32 16, %struct.CustomAttrs* %[[ATTRA]]) + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +entry: + %0 = alloca %struct.CustomAttrs + %agg.tmp = alloca %dx.types.HitObject, align 4 + %agg.tmp1 = alloca %dx.types.HitObject, align 4 + %q2 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 5, i32 0), !dbg !38 ; line:29 col:78 + %1 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !42 ; line:31 col:3 + %2 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %1), !dbg !42 ; line:31 col:3 + %3 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !42 ; line:31 col:3 + call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %q2, %dx.types.Handle %3, i32 0, i32 255, <3 x float> zeroinitializer, float 0.000000e+00, <3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float 9.999000e+03), !dbg !42 ; line:31 col:3 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 363, %dx.types.HitObject* %agg.tmp, i32 %q2), !dbg !43 ; line:33 col:7 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp) #0, !dbg !44 ; line:24 col:3 + %.0 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %0, i32 0, i32 0 + store float 1.000000e+00, float* %.0 + %.1 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %0, i32 0, i32 1 + store float 2.000000e+00, float* %.1, align 4 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp1, i32 %q2, i32 16, %struct.CustomAttrs* %0), !dbg !47 ; line:36 col:7 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp1) #0, !dbg !48 ; line:24 col:3 + ret void, !dbg !49 ; line:37 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; 
Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32, %dx.types.HitObject*, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!dx.version = !{!2} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = !{!4, !26} +!dx.entryPoints = !{!30} +!dx.fnprops = !{!35} +!dx.options = !{!36, !37} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"lib", i32 6, i32 9} +!4 = !{i32 0, %"class.RWStructuredBuffer" undef, !5, %struct.RayDesc undef, !10, %"class.dx::HitObject" undef, !15, %"class.RayQuery<5, 0>" undef, !17, %struct.CustomAttrs undef, !23} +!5 = !{i32 4, !6, !7} +!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9} +!7 = !{i32 0, !8} +!8 = !{!9} +!9 = !{i32 0, float undef} +!10 = !{i32 32, !11, !12, !13, !14} +!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3} +!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!15 = !{i32 4, !16} +!16 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!17 = !{i32 4, !18, !19} +!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!19 = !{i32 0, !20} +!20 = !{!21, !22} +!21 = !{i32 1, i64 5} +!22 = !{i32 1, i64 0} +!23 = !{i32 8, !24, !25} +!24 = !{i32 6, !"x", i32 3, i32 0, i32 7, i32 9} +!25 = !{i32 6, !"y", i32 3, i32 4, i32 7, i32 9} +!26 = !{i32 1, void ()* @"\01?main@@YAXXZ", !27} +!27 = !{!28} +!28 = !{i32 1, !29, !29} +!29 = !{} +!30 = !{null, !"", null, !31, null} +!31 = !{!32, null, null, null} +!32 = !{!33} +!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34} +!34 = !{i32 0, i32 4} +!35 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!36 = !{i32 -2147483584} +!37 = !{i32 -1} +!38 = !DILocation(line: 29, column: 78, scope: !39) +!39 = !DISubprogram(name: "main", scope: !40, file: !40, line: 28, type: !41, isLocal: false, isDefinition: true, scopeLine: 28, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!40 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl", directory: "") +!41 = !DISubroutineType(types: !29) +!42 = !DILocation(line: 31, column: 3, scope: !39) +!43 = !DILocation(line: 33, column: 7, scope: !39) +!44 = !DILocation(line: 24, column: 3, scope: !45, inlinedAt: !46) +!45 = !DISubprogram(name: "Use", scope: !40, file: !40, line: 23, type: !41, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false) 
+!46 = distinct !DILocation(line: 33, column: 3, scope: !39) +!47 = !DILocation(line: 36, column: 7, scope: !39) +!48 = !DILocation(line: 24, column: 3, scope: !45, inlinedAt: !49) +!49 = distinct !DILocation(line: 36, column: 3, scope: !39) +!50 = !DILocation(line: 37, column: 1, scope: !39) diff --git a/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll new file mode 100644 index 0000000000..03bb0716ce --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/hitobject_traceinvoke_dxilgen.ll @@ -0,0 +1,124 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%struct.Payload = type { <3 x float> } +%dx.types.HitObject = type { i8* } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.RWStructuredBuffer" = type { float } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%"class.dx::HitObject" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +entry: + %pld_invoke = alloca %struct.Payload + %pld_trace = alloca %struct.Payload + %hit = alloca %dx.types.HitObject, align 4 + %0 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !32 ; line:91 col:3 + call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !32 ; line:91 col:3 + %1 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !36 ; line:91 col:23 + %rtas = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %1), !dbg !36 ; line:91 col:23 + + ; Capture the handle for the RTAS + ; CHECK: %[[RTAS:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }) + %2 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %rtas, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !36 ; line:91 col:23 + + %3 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_trace, i32 0, i32 0, !dbg !36 ; line:91 col:23 + store <3 x float> , <3 x float>* %3, !dbg !36 ; line:91 col:23 + + ; CHECK: %[[TRACEHO:[^ ]+]] = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* %pld_trace), !dbg !3 ; line:91 col:23 + ; CHECK: store %dx.types.HitObject %[[TRACEHO]], %dx.types.HitObject* %hit + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %2, i32 513, i32 1, i32 2, i32 4, i32 0, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>, float 3.000000e+00, <3 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>, float 7.000000e+00, %struct.Payload* 
%pld_trace), !dbg !36 ; line:91 col:23 + + %4 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_trace, i32 0, i32 0, !dbg !37 ; line:101 col:3 + %5 = load <3 x float>, <3 x float>* %4, !dbg !37 ; line:101 col:3 + %6 = getelementptr inbounds %struct.Payload, %struct.Payload* %pld_invoke, i32 0, i32 0, !dbg !37 ; line:101 col:3 + store <3 x float> %5, <3 x float>* %6, !dbg !37 ; line:101 col:3 + + ; CHECK: %[[INVOKEHO:[^ ]+]] = load %dx.types.HitObject, %dx.types.HitObject* %hit + ; CHECK: call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %[[INVOKEHO]], %struct.Payload* %pld_invoke) + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld_invoke), !dbg !37 ; line:101 col:3 + + %7 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !38 ; line:102 col:1 + call void @llvm.lifetime.end(i64 4, i8* %7) #0, !dbg !38 ; line:102 col:1 + ret void, !dbg !38 ; line:102 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32, %dx.types.HitObject*, %struct.Payload*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!3} +!dx.shaderModel = !{!4} +!dx.typeAnnotations = !{!5, !20} +!dx.entryPoints = !{!24} +!dx.fnprops = !{!29} +!dx.options = !{!30, !31} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"} +!3 = !{i32 1, i32 9} +!4 = !{!"lib", i32 6, i32 9} +!5 = !{i32 0, %"class.RWStructuredBuffer" undef, !6, %struct.RayDesc undef, !11, %struct.Payload undef, !16, %"class.dx::HitObject" undef, !18} +!6 = !{i32 4, !7, !8} +!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9} +!8 = !{i32 0, !9} +!9 = !{!10} +!10 = !{i32 0, float undef} +!11 = !{i32 32, !12, !13, !14, !15} +!12 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!13 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!14 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3} +!15 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!16 = !{i32 12, !17} +!17 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!18 = !{i32 4, !19} +!19 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!20 = !{i32 1, void ()* @"\01?main@@YAXXZ", !21} +!21 = !{!22} +!22 = !{i32 1, !23, !23} +!23 = !{} +!24 = !{null, !"", null, !25, null} +!25 = 
!{!26, null, null, null} +!26 = !{!27} +!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28} +!28 = !{i32 0, i32 4} +!29 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!30 = !{i32 -2147483584} +!31 = !{i32 -1} +!32 = !DILocation(line: 91, column: 3, scope: !33) +!33 = !DISubprogram(name: "main", scope: !34, file: !34, line: 81, type: !35, isLocal: false, isDefinition: true, scopeLine: 81, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!34 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl", directory: "") +!35 = !DISubroutineType(types: !23) +!36 = !DILocation(line: 91, column: 23, scope: !33) +!37 = !DILocation(line: 101, column: 3, scope: !33) +!38 = !DILocation(line: 102, column: 1, scope: !33) diff --git a/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll new file mode 100644 index 0000000000..ea1be46c4c --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll @@ -0,0 +1,189 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.ByteAddressBuffer = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?input_vector_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?opa_input_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?matrix_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?bias_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 +@"\01?output_vector_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 + +; Function Attrs: nounwind +define void @cs_main() #0 { +entry: + ;CHECK-DAG: %[[MLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A" + ;CHECK-DAG: %[[BLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A" + ;CHECK-DAG: %[[RWMLD0:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" + %output_vector = alloca <4 x float>, align 4 + %tmp = bitcast <4 x float>* %output_vector to i8*, !dbg !21 ; line:14 col:5 + call void @llvm.lifetime.start(i64 16, i8* %tmp) #0, !dbg !21 ; line:14 col:5 + %tmp1 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !dbg !25 ; line:17 col:37 + %tmp2 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp1), !dbg !25 ; line:17 col:37 + %tmp3 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp2, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !25 ; line:17 col:37 + %tmp4 = call <4 x float> @"dx.hl.op.ro.<4 x float> (i32, 
%dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp3, i32 0), !dbg !25 ; line:17 col:37 + %tmp5 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !26 ; line:33 col:5 + %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp5), !dbg !26 ; line:33 col:5 + %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !26 ; line:33 col:5 + + ;CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]] + ;CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH0]] + ;CHECK: call <4 x float> @dx.op.matVecMul.v4f32.v4f32(i32 305, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, i1 false) + call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp7, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64), !dbg !26 ; line:33 col:5 + + %tmp8 = load <4 x float>, <4 x float>* %output_vector, align 4, !dbg !27, !tbaa !28 ; line:37 col:35 + %tmp9 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !31 ; line:37 col:5 + %tmp10 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp9), !dbg !31 ; line:37 col:5 + %tmp11 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp10, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !31 ; line:37 col:5 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp11, i32 0, <4 x float> %tmp8), !dbg !31 ; line:37 col:5 + %tmp12 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5 + %tmp13 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp12), !dbg !32 ; line:49 col:5 + %tmp14 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp13, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5 + %tmp15 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5 + %tmp16 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp15), !dbg !32 ; line:49 col:5 + %tmp17 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp16, %dx.types.ResourceProperties { i32 11, i32 0 }, 
%struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5 + + ;CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]] + ;CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH1]] + ;CHECK: %[[BCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[BLD]] + ;CHECK: %[[BAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[BCH1]] + ;CHECK: call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9, i1 false) + call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp14, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %tmp17, i32 0, i32 9), !dbg !32 ; line:49 col:5 + + %tmp18 = load <4 x float>, <4 x float>* %output_vector, align 4, !dbg !33, !tbaa !28 ; line:54 col:38 + %tmp19 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !34 ; line:54 col:5 + %tmp20 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp19), !dbg !34 ; line:54 col:5 + %tmp21 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp20, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !34 ; line:54 col:5 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp21, i32 1024, <4 x float> %tmp18), !dbg !34 ; line:54 col:5 + %tmp22 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !35 ; line:56 col:37 + %tmp23 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp22), !dbg !35 ; line:56 col:37 + %tmp24 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp23, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !35 ; line:56 col:37 + %tmp25 = call <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp24, i32 0), !dbg !35 ; line:56 col:37 + %tmp26 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !36 ; line:57 col:37 + %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp26), !dbg !36 ; line:57 col:37 + %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !36 ; line:57 col:37 + %tmp29 = call <8 x i32> 
@"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp28, i32 128), !dbg !36 ; line:57 col:37 + %tmp30 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !37 ; line:67 col:5 + %tmp31 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp30), !dbg !37 ; line:67 col:5 + %tmp32 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp31, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !37 ; line:67 col:5 + + ;CHECK: %[[RWMCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]] + ;CHECK: %[[RWMAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH0]] + ;CHECK: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH0]], i32 0, i32 5, i32 3, i32 0) + call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %tmp25, <8 x i32> %tmp29, %dx.types.Handle %tmp32, i32 0, i32 5, i32 3, i32 0), !dbg !37 ; line:67 col:5 + + + %tmp33 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !38 ; line:77 col:5 + %tmp34 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp33), !dbg !38 ; line:77 col:5 + %tmp35 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp34, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !38 ; line:77 col:5 + + ;CHECK: %[[RWMCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]] + ;CHECK: %[[RWMAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH1]] + ;CHECK: call void @dx.op.vectorAccumulate.v8i32(i32 308, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH1]], i32 0) + call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %tmp25, %dx.types.Handle %tmp35, i32 0), !dbg !38 ; line:77 col:5 + + %tmp36 = bitcast <4 x float>* %output_vector to i8*, !dbg !39 ; line:79 col:1 + call void @llvm.lifetime.end(i64 16, i8* %tmp36) #0, !dbg !39 ; line:79 col:1 + ret void, !dbg !39 ; line:79 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind readonly +declare <4 x float> @"dx.hl.op.ro.<4 x float> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32, %struct.ByteAddressBuffer) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, 
%struct.ByteAddressBuffer) #2 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32, %dx.types.Handle, i32, <4 x float>) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #2 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32) #0 + +; Function Attrs: nounwind readonly +declare <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32, <8 x i32>, %dx.types.Handle, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!dx.version = !{!2} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = !{!4} +!dx.entryPoints = !{!8} +!dx.fnprops = !{!18} +!dx.options = !{!19, !20} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"cs", i32 6, i32 9} +!4 = !{i32 1, void ()* @cs_main, !5} +!5 = !{!6} +!6 = !{i32 1, !7, !7} +!7 = !{} +!8 = !{void ()* @cs_main, !"cs_main", null, !9, null} +!9 = !{!10, !15, null, null} +!10 = !{!11, !12, !13, !14} +!11 = !{i32 0, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !"input_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!12 = !{i32 1, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !"opa_input_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!13 = !{i32 2, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !"matrix_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!14 = !{i32 3, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !"bias_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!15 = !{!16, !17} +!16 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !"rw_matrix_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!17 = !{i32 1, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !"output_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!18 = !{void ()* @cs_main, i32 5, i32 1, i32 1, i32 1} +!19 = !{i32 -2147483584} +!20 = !{i32 -1} +!21 = !DILocation(line: 14, column: 5, scope: !22) +!22 = !DISubprogram(name: 
"cs_main", scope: !23, file: !23, line: 12, type: !24, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @cs_main) +!23 = !DIFile(filename: "DirectXShaderCompiler\5Ctools\5Cclang\5Ctest\5CCodeGenDXIL\5Chlsl\5Cintrinsics\5Clinalg_builtins\5Clinalg-builtins.hlsl", directory: "") +!24 = !DISubroutineType(types: !7) +!25 = !DILocation(line: 17, column: 37, scope: !22) +!26 = !DILocation(line: 33, column: 5, scope: !22) +!27 = !DILocation(line: 37, column: 35, scope: !22) +!28 = !{!29, !29, i64 0} +!29 = !{!"omnipotent char", !30, i64 0} +!30 = !{!"Simple C/C++ TBAA"} +!31 = !DILocation(line: 37, column: 5, scope: !22) +!32 = !DILocation(line: 49, column: 5, scope: !22) +!33 = !DILocation(line: 54, column: 38, scope: !22) +!34 = !DILocation(line: 54, column: 5, scope: !22) +!35 = !DILocation(line: 56, column: 37, scope: !22) +!36 = !DILocation(line: 57, column: 37, scope: !22) +!37 = !DILocation(line: 67, column: 5, scope: !22) +!38 = !DILocation(line: 77, column: 5, scope: !22) +!39 = !DILocation(line: 79, column: 1, scope: !22) diff --git a/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll new file mode 100644 index 0000000000..b969a63f12 --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_cb_raydesc_dxilgen.ll @@ -0,0 +1,160 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s + +; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%"$Globals" = type { %struct.RayDesc } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.RayQuery<513, 0>" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"$Globals" = external constant %"$Globals" + +; Function Attrs: nounwind +define void @main() #0 { +entry: + + ; Capture CB, RTAS, and RayQuery + ; CHECK-DAG: %[[CB:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %"$Globals", %dx.types.ResourceProperties { i32 13, i32 32 }) + ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }) + ; CHECK-DAG: %[[RQ:[^ ,]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513) + + %0 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0) + %1 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %0, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef) + %2 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %1, i32 0) + %3 = getelementptr inbounds %"$Globals", %"$Globals"* %2, i32 0, i32 0 + %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !34 ; line:12 col:71 + %4 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !38 ; line:13 col:3 + 
%5 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %4), !dbg !38 ; line:13 col:3 + %6 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !38 ; line:13 col:3 + + ; Load RayDesc.Origin + ; CHECK: %[[ORIG_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 0) + ; CHECK: %[[ORIG_EX0:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 0 + ; CHECK: %[[ORIG_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[ORIG_EX0]], i64 0 + ; CHECK: %[[ORIG_EX1:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 1 + ; CHECK: %[[ORIG_VXY:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VX]], float %[[ORIG_EX1]], i64 1 + ; CHECK: %[[ORIG_EX2:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[ORIG_CB_LD]], 2 + ; CHECK: %[[ORIG_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VXY]], float %[[ORIG_EX2]], i64 2 + + ; Load RayDesc.TMin + ; CHECK: %[[TMIN_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 0) + ; CHECK: %[[TMIN:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[TMIN_CB_LD]], 3 + + ; Load RayDesc.Direction + ; CHECK: %[[DIR_CB_LD:[^ ,]+]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 1) + ; CHECK: %[[DIR_EX0:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 0 + ; CHECK: %[[DIR_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[DIR_EX0]], i64 0 + ; CHECK: %[[DIR_EX1:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 1 + ; CHECK: %[[DIR_VXY:[^ ,]+]] = insertelement <3 x float> %[[DIR_VX]], float %[[DIR_EX1]], i64 1 + ; CHECK: %[[DIR_EX2:[^ ,]+]] = extractvalue %dx.types.CBufRet.f32 %[[DIR_CB_LD]], 2 + ; CHECK: %[[DIR_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[DIR_VXY]], float %[[DIR_EX2]], i64 2 + + ; Load RayDesc.TMax + ; CHECK: %21 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %[[CB]], i32 1) + ; CHECK: %22 = extractvalue %dx.types.CBufRet.f32 %21, 3 + + ; Extract RayDesc vector fields + ; CHECK: %[[ORIGX:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 0 + ; CHECK: %[[ORIGY:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 1 + ; CHECK: %[[ORIGZ:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 2 + ; CHECK: %[[DIRX:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 0 + ; CHECK: %[[DIRY:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 1 + ; CHECK: %[[DIRZ:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 2 + + %7 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 0, !dbg !38 ; line:13 col:3 + %8 = load <3 x float>, <3 x float>* %7, !dbg !38 ; line:13 col:3 + %9 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 1, !dbg !38 ; line:13 col:3 + %10 = load float, float* %9, !dbg !38 ; line:13 col:3 + %11 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 2, !dbg !38 ; line:13 col:3 + %12 = load <3 x float>, <3 x float>* %11, !dbg !38 ; line:13 col:3 + %13 = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %3, i32 0, i32 3, !dbg !38 ; line:13 col:3 + %14 = load float, float* %13, !dbg 
!38 ; line:13 col:3 + + ; Call TraceRayInline + ; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, float %[[ORIGX]], float %[[ORIGY]], float %[[ORIGZ]], float %[[TMIN]], float %[[DIRX]], float %[[DIRY]], float %[[DIRZ]], float %22) + + call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %rayQuery1, %dx.types.Handle %6, i32 1, i32 2, <3 x float> %8, float %10, <3 x float> %12, float %14), !dbg !38 ; line:13 col:3 + ret void, !dbg !39 ; line:14 col:1 +} + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6, !20} +!dx.entryPoints = !{!24} +!dx.fnprops = !{!31} +!dx.options = !{!32, !33} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"} +!3 = !{i32 1, i32 5} +!4 = !{i32 1, i32 9} +!5 = !{!"vs", i32 6, i32 5} +!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12, %"$Globals" undef, !18} +!7 = !{i32 32, !8, !9, !10, !11} +!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9} +!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9} +!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!12 = !{i32 4, !13, !14} +!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!14 = !{i32 0, !15} +!15 = !{!16, !17} +!16 = !{i32 1, i64 513} +!17 = !{i32 1, i64 0} +!18 = !{i32 32, !19} +!19 = !{i32 6, !"rayDesc", i32 3, i32 0} +!20 = !{i32 1, void ()* @main, !21} +!21 = !{!22} +!22 = !{i32 1, !23, !23} +!23 = !{} +!24 = !{void ()* @main, !"main", null, !25, null} +!25 = !{!26, null, !29, null} +!26 = !{!27} +!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28} +!28 = !{i32 0, i32 4} +!29 = !{!30} +!30 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, 
i32 32, null} +!31 = !{void ()* @main, i32 1} +!32 = !{i32 64} +!33 = !{i32 -1} +!34 = !DILocation(line: 12, column: 71, scope: !35) +!35 = !DISubprogram(name: "main", scope: !36, file: !36, line: 11, type: !37, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @main) +!36 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl", directory: "") +!37 = !DISubroutineType(types: !23) +!38 = !DILocation(line: 13, column: 3, scope: !35) +!39 = !DILocation(line: 14, column: 1, scope: !35) diff --git a/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll new file mode 100644 index 0000000000..0d97d8782d --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/tracerayinline_dxilgen.ll @@ -0,0 +1,134 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s + +; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl, +; with call to DoTrace commented out. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%"class.RayQuery<513, 0>" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #0 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #1 + +; Function Attrs: nounwind +define void @main(float* noalias, <3 x float>, float, <3 x float>, float) #1 { +entry: + + ; Load RayDesc fields from input + ; CHECK-DAG: %[[ORIGX_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef) + ; CHECK-DAG: %[[ORIGY_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef) + ; CHECK-DAG: %[[ORIGZ_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 2, i32 undef) + ; CHECK-DAG: %[[TMIN:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 0, i32 undef) + ; CHECK-DAG: %[[DIRX_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 0, i32 undef) + ; CHECK-DAG: %[[DIRY_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 1, i32 undef) + ; CHECK-DAG: %[[DIRZ_LI:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 2, i32 undef) + ; CHECK-DAG: %[[TMAX:[^ ,]+]] = call float @dx.op.loadInput.f32(i32 4, i32 3, i32 0, i8 0, i32 undef) + ; CHECK-DAG: %[[ORIG_VX:[^ ,]+]] = insertelement <3 x float> undef, float %[[ORIGX_LI]], i64 0 + ; CHECK-DAG: %[[ORIG_VXY:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VX]], float %[[ORIGY_LI]], i64 1 + ; CHECK-DAG: %[[ORIG_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[ORIG_VXY]], float %[[ORIGZ_LI]], i64 2 + ; CHECK-DAG: %[[DIR_VX:[^ ,]+]] = insertelement <3 x 
float> undef, float %[[DIRX_LI]], i64 0 + ; CHECK-DAG: %[[DIR_VXY:[^ ,]+]] = insertelement <3 x float> %[[DIR_VX]], float %[[DIRY_LI]], i64 1 + ; CHECK-DAG: %[[DIR_VXYZ:[^ ,]+]] = insertelement <3 x float> %[[DIR_VXY]], float %[[DIRZ_LI]], i64 2 + + ; Capture RayQuery and RTAS + ; CHECK-DAG: %[[RQ:[^ ,]+]] = call i32 @dx.op.allocateRayQuery(i32 178, i32 513) + ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }) + + %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !41 ; line:15 col:71 + %5 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !45 ; line:17 col:3 + %6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %5), !dbg !45 ; line:17 col:3 + %7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure zeroinitializer), !dbg !45 ; line:17 col:3 + + ; Extract RayDesc vector fields + ; CHECK-DAG: %[[ORIGX:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 0 + ; CHECK-DAG: %[[ORIGY:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 1 + ; CHECK-DAG: %[[ORIGZ:[^ ,]+]] = extractelement <3 x float> %[[ORIG_VXYZ]], i64 2 + ; CHECK-DAG: %[[DIRX:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 0 + ; CHECK-DAG: %[[DIRY:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 1 + ; CHECK-DAG: %[[DIRZ:[^ ,]+]] = extractelement <3 x float> %[[DIR_VXYZ]], i64 2 + + ; Call TraceRayInline + ; CHECK: call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, float %[[ORIGX]], float %[[ORIGY]], float %[[ORIGZ]], float %[[TMIN]], float %[[DIRX]], float %[[DIRY]], float %[[DIRZ]], float %[[TMAX]]) + + call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %rayQuery1, %dx.types.Handle %7, i32 1, i32 2, <3 x float> %1, float %2, <3 x float> %3, float %4), !dbg !45 ; line:17 col:3 + store float 0.000000e+00, float* %0, !dbg !46 ; line:18 col:3 + ret void, !dbg !46 ; line:18 col:3 +} + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6, !18} +!dx.entryPoints = !{!33} +!dx.fnprops = !{!38} +!dx.options = !{!39, !40} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"} +!3 = !{i32 1, i32 5} +!4 = !{i32 1, i32 9} +!5 = !{!"vs", i32 6, i32 5} +!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12} +!7 = !{i32 32, !8, !9, !10, !11} +!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9} +!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9} +!11 = !{i32 6, !"TMax", 
i32 3, i32 28, i32 7, i32 9} +!12 = !{i32 4, !13, !14} +!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!14 = !{i32 0, !15} +!15 = !{!16, !17} +!16 = !{i32 1, i64 513} +!17 = !{i32 1, i64 0} +!18 = !{i32 1, void (float*, <3 x float>, float, <3 x float>, float)* @main, !19} +!19 = !{!20, !22, !25, !27, !29, !31} +!20 = !{i32 0, !21, !21} +!21 = !{} +!22 = !{i32 1, !23, !24} +!23 = !{i32 4, !"OUT", i32 7, i32 9} +!24 = !{i32 0} +!25 = !{i32 0, !26, !24} +!26 = !{i32 4, !"RAYDESC", i32 7, i32 9} +!27 = !{i32 0, !26, !28} +!28 = !{i32 1} +!29 = !{i32 0, !26, !30} +!30 = !{i32 2} +!31 = !{i32 0, !26, !32} +!32 = !{i32 3} +!33 = !{void (float*, <3 x float>, float, <3 x float>, float)* @main, !"main", null, !34, null} +!34 = !{!35, null, null, null} +!35 = !{!36} +!36 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !37} +!37 = !{i32 0, i32 4} +!38 = !{void (float*, <3 x float>, float, <3 x float>, float)* @main, i32 1} +!39 = !{i32 64} +!40 = !{i32 -1} +!41 = !DILocation(line: 15, column: 71, scope: !42) +!42 = !DISubprogram(name: "main", scope: !43, file: !43, line: 14, type: !44, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: false, function: void (float*, <3 x float>, float, <3 x float>, float)* @main) +!43 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl", directory: "") +!44 = !DISubroutineType(types: !21) +!45 = !DILocation(line: 17, column: 3, scope: !42) +!46 = !DILocation(line: 18, column: 3, scope: !42) diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll new file mode 100644 index 0000000000..85c3a34eb9 --- /dev/null +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_fromrayquery_scalarrepl.ll @@ -0,0 +1,383 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s + +; COM: Original HLSL code +; COM: RaytracingAccelerationStructure RTAS; +; COM: RWStructuredBuffer UAV : register(u0); +; COM: RWByteAddressBuffer inbuf; +; COM: RWByteAddressBuffer outbuf; +; COM: +; COM: RayDesc MakeRayDesc() { +; COM: RayDesc desc; +; COM: desc.Origin = float3(0, 0, 0); +; COM: desc.Direction = float3(1, 0, 0); +; COM: desc.TMin = 0.0f; +; COM: desc.TMax = 9999.0; +; COM: return desc; +; COM: } +; COM: +; COM: struct CustomAttrs { +; COM: float x; +; COM: float y; +; COM: }; +; COM: +; COM: void Use(in dx::HitObject hit) { +; COM: dx::MaybeReorderThread(hit); +; COM: } +; COM: +; COM: [shader("raygeneration")] +; COM: void main() { +; COM: RayQuery q; +; COM: RayDesc ray = MakeRayDesc(); +; COM: q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray); +; COM: +; COM: Use(dx::HitObject::FromRayQuery(q)); +; COM: +; COM: CustomAttrs attrs; +; COM: attrs.x = inbuf.Load(0); +; COM: attrs.y = inbuf.Load(4); +; COM: Use(dx::HitObject::FromRayQuery(q, 16, attrs)); +; COM: +; COM: attrs.x = inbuf.Load(8); +; COM: attrs.y = inbuf.Load(12); +; COM: Use(dx::HitObject::FromRayQuery(q, 17, attrs)); +; COM: +; COM: outbuf.Store(0, attrs.x); +; COM: outbuf.Store(4, attrs.y); +; COM: } + +; +; Buffer Definitions: +; +; cbuffer $Globals +; { +; +; [0 x i8] (type annotation not present) +; +; } +; +; Resource bind info for UAV +; { +; +; float $Element; ; Offset: 0 Size: 4 +; +; } +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ 
---------- ------- ----------- ------- -------------- ------ +; $Globals cbuffer NA NA CB0 cb4294967295 1 +; RTAS texture i32 ras T0t4294967295,space4294967295 1 +; UAV UAV struct r/w U0 u0 1 +; inbuf UAV byte r/w U1u4294967295,space4294967295 1 +; outbuf UAV byte r/w U2u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%"class.RWStructuredBuffer" = type { float } +%struct.RWByteAddressBuffer = type { i32 } +%ConstantBuffer = type opaque +%"class.RayQuery<5, 0>" = type { i32 } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%dx.types.HitObject = type { i8* } +%struct.CustomAttrs = type { float, float } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.dx::HitObject" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"\01?UAV@@3V?$RWStructuredBuffer@M@@A" = external global %"class.RWStructuredBuffer", align 4 +@"\01?inbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 +@"\01?outbuf@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 +@"$Globals" = external constant %ConstantBuffer + +; CHECK: %[[RQA:[^ ]+]] = alloca i32 +; CHECK: %[[XATTRA:[^ ]+]] = alloca float +; CHECK: %[[YATTRA:[^ ]+]] = alloca float +; CHECK: %[[ATTRA0:[^ ]+]] = alloca %struct.CustomAttrs +; CHECK: %[[ATTRA1:[^ ]+]] = alloca %struct.CustomAttrs + +; COM: Check same query handle used for TraceRayInline and the FromRayQuery calls +; CHECK: %[[RQH:[^ ]+]] = load i32, i32* %[[RQA]] +; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQH]], + +; COM: Check RQ handle loaded for first FromRayQuery call +; CHECK: %[[RQH0:[^ ]+]] = load i32, i32* %[[RQA]] +; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH0]]) + +; COM: Check buffer loads for first FromRayQuery-with-attrs call +; CHECK: %[[XI0:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 0) +; CHECK: %[[XF0:[^ ]+]] = uitofp i32 %[[XI0]] to float +; CHECK: store float %[[XF0]], float* %[[XATTRA]], align 4 +; CHECK: %[[YI0:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 4) +; CHECK: %[[YF0:[^ ]+]] = uitofp i32 %[[YI0]] to float +; CHECK: store float %[[YF0]], float* %[[YATTRA]], align 4 + +; COM: Check that values from buffer flow into first FromRayQuery-with-attrs call +; CHECK: %[[XPTR0:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA0]], i32 0, i32 0 +; CHECK: %[[XF1:[^ ]+]] = load float, float* %[[XATTRA]] +; CHECK: store float %[[XF1]], float* %[[XPTR0]] +; CHECK: %[[YPTR0:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA0]], i32 0, i32 1 +; CHECK: %[[YF1:[^ ]+]] = load float, float* %[[YATTRA]] +; CHECK: store float %[[YF1]], float* %[[YPTR0]] +; CHECK: %[[RQH1:[^ ]+]] = load i32, i32* %[[RQA]] +; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH1]], i32 16, %struct.CustomAttrs* %[[ATTRA0]]) + +; COM: Check buffer loads for second FromRayQuery-with-attrs 
call +; CHECK: %[[XI1:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 8) +; CHECK: %[[XF1:[^ ]+]] = uitofp i32 %[[XI1]] to float +; CHECK: store float %[[XF1]], float* %[[XATTRA]], align 4 +; CHECK: %[[YI1:[^ ]+]] = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %{{[^ ]+}}, i32 12) +; CHECK: %[[YF1:[^ ]+]] = uitofp i32 %[[YI1]] to float +; CHECK: store float %[[YF1]], float* %[[YATTRA]], align 4 + +; COM: Check that values from buffer flow into second FromRayQuery-with-attrs call +; CHECK: %[[XPTR1:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA1]], i32 0, i32 0 +; CHECK: %[[XF2:[^ ]+]] = load float, float* %[[XATTRA]] +; CHECK: store float %[[XF2]], float* %[[XPTR1]] +; CHECK: %[[YPTR1:[^ ]+]] = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %[[ATTRA1]], i32 0, i32 1 +; CHECK: %[[YF2:[^ ]+]] = load float, float* %[[YATTRA]] +; CHECK: store float %[[YF2]], float* %[[YPTR1]] +; CHECK: %[[RQH2:[^ ]+]] = load i32, i32* %[[RQA]] +; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %{{[^ ]+}}, i32 %[[RQH2]], i32 17, %struct.CustomAttrs* %[[ATTRA1]]) + + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +entry: + %q = alloca %"class.RayQuery<5, 0>", align 4 + %ray = alloca %struct.RayDesc, align 4 + %agg.tmp = alloca %dx.types.HitObject, align 4 + %attrs = alloca %struct.CustomAttrs, align 4 + %agg.tmp4 = alloca %dx.types.HitObject, align 4 + %agg.tmp11 = alloca %dx.types.HitObject, align 4 + %0 = bitcast %"class.RayQuery<5, 0>"* %q to i8*, !dbg !45 ; line:26 col:3 + call void @llvm.lifetime.start(i64 4, i8* %0) #0, !dbg !45 ; line:26 col:3 + %q14 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 5, i32 0), !dbg !49 ; line:26 col:78 + %1 = getelementptr inbounds %"class.RayQuery<5, 0>", %"class.RayQuery<5, 0>"* %q, i32 0, i32 0, !dbg !49 ; line:26 col:78 + store i32 %q14, i32* %1, !dbg !49 ; line:26 col:78 + %2 = bitcast %struct.RayDesc* %ray to i8*, !dbg !50 ; line:27 col:3 + call void @llvm.lifetime.start(i64 32, i8* %2) #0, !dbg !50 ; line:27 col:3 + %Origin.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 0, !dbg !51 ; line:8 col:8 + store <3 x float> zeroinitializer, <3 x float>* %Origin.i, align 4, !dbg !54, !tbaa !55, !alias.scope !58 ; line:8 col:15 + %Direction.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 2, !dbg !61 ; line:9 col:8 + store <3 x float> , <3 x float>* %Direction.i, align 4, !dbg !62, !tbaa !55, !alias.scope !58 ; line:9 col:18 + %TMin.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 1, !dbg !63 ; line:10 col:8 + store float 0.000000e+00, float* %TMin.i, align 4, !dbg !64, !tbaa !65, !alias.scope !58 ; line:10 col:13 + %TMax.i = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %ray, i32 0, i32 3, !dbg !67 ; line:11 col:8 + store float 9.999000e+03, float* %TMax.i, align 4, !dbg !68, !tbaa !65, !alias.scope !58 ; line:11 col:13 + %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !69 ; line:28 col:3 + %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !69 ; line:28 col:3 + %5 = call %dx.types.Handle 
@"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !69 ; line:28 col:3 + call void @"dx.hl.op..void (i32, %\22class.RayQuery<5, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<5, 0>"* %q, %dx.types.Handle %5, i32 0, i32 255, %struct.RayDesc* %ray), !dbg !69 ; line:28 col:3 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32 363, %dx.types.HitObject* %agg.tmp, %"class.RayQuery<5, 0>"* %q), !dbg !70 ; line:30 col:7 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp) #0, !dbg !71 ; line:21 col:3 + %6 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !74 ; line:32 col:3 + call void @llvm.lifetime.start(i64 8, i8* %6) #0, !dbg !74 ; line:32 col:3 + %7 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !75 ; line:33 col:13 + %8 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %7), !dbg !75 ; line:33 col:13 + %9 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %8, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !75 ; line:33 col:13 + %10 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %9, i32 0), !dbg !75 ; line:33 col:13 + %conv = uitofp i32 %10 to float, !dbg !75 ; line:33 col:13 + %x = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !76 ; line:33 col:9 + store float %conv, float* %x, align 4, !dbg !77, !tbaa !65 ; line:33 col:11 + %11 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !78 ; line:34 col:13 + %12 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %11), !dbg !78 ; line:34 col:13 + %13 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %12, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !78 ; line:34 col:13 + %14 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %13, i32 4), !dbg !78 ; line:34 col:13 + %conv3 = uitofp i32 %14 to float, !dbg !78 ; line:34 col:13 + %y = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !79 ; line:34 col:9 + store float %conv3, float* %y, align 4, !dbg !80, !tbaa !65 ; line:34 col:11 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp4, %"class.RayQuery<5, 0>"* %q, i32 16, %struct.CustomAttrs* %attrs), !dbg !81 ; line:35 col:7 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp4) #0, !dbg !82 ; line:21 col:3 + %15 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !84 ; line:37 col:13 + %16 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle 
(i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %15), !dbg !84 ; line:37 col:13 + %17 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %16, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !84 ; line:37 col:13 + %18 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %17, i32 8), !dbg !84 ; line:37 col:13 + %conv6 = uitofp i32 %18 to float, !dbg !84 ; line:37 col:13 + %x7 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !85 ; line:37 col:9 + store float %conv6, float* %x7, align 4, !dbg !86, !tbaa !65 ; line:37 col:11 + %19 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !dbg !87 ; line:38 col:13 + %20 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %19), !dbg !87 ; line:38 col:13 + %21 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %20, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !87 ; line:38 col:13 + %22 = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %21, i32 12), !dbg !87 ; line:38 col:13 + %conv9 = uitofp i32 %22 to float, !dbg !87 ; line:38 col:13 + %y10 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !88 ; line:38 col:9 + store float %conv9, float* %y10, align 4, !dbg !89, !tbaa !65 ; line:38 col:11 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %agg.tmp11, %"class.RayQuery<5, 0>"* %q, i32 17, %struct.CustomAttrs* %attrs), !dbg !90 ; line:39 col:7 + call void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32 359, %dx.types.HitObject* %agg.tmp11) #0, !dbg !91 ; line:21 col:3 + %x12 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 0, !dbg !93 ; line:41 col:25 + %23 = load float, float* %x12, align 4, !dbg !93, !tbaa !65 ; line:41 col:25 + %24 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !94 ; line:41 col:3 + %25 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %24), !dbg !94 ; line:41 col:3 + %26 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %25, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !94 ; line:41 col:3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %26, i32 0, float %23), !dbg !94 ; line:41 col:3 + %y13 = getelementptr inbounds %struct.CustomAttrs, %struct.CustomAttrs* %attrs, i32 0, i32 1, !dbg !95 ; line:42 col:25 + %27 = load float, float* %y13, align 4, !dbg !95, !tbaa !65 ; line:42 col:25 + %28 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !dbg !96 ; line:42 col:3 + %29 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, 
%struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %28), !dbg !96 ; line:42 col:3 + %30 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %29, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef), !dbg !96 ; line:42 col:3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %30, i32 4, float %27), !dbg !96 ; line:42 col:3 + %31 = bitcast %struct.CustomAttrs* %attrs to i8*, !dbg !97 ; line:43 col:1 + call void @llvm.lifetime.end(i64 8, i8* %31) #0, !dbg !97 ; line:43 col:1 + %32 = bitcast %struct.RayDesc* %ray to i8*, !dbg !97 ; line:43 col:1 + call void @llvm.lifetime.end(i64 32, i8* %32) #0, !dbg !97 ; line:43 col:1 + %33 = bitcast %"class.RayQuery<5, 0>"* %q to i8*, !dbg !97 ; line:43 col:1 + call void @llvm.lifetime.end(i64 4, i8* %33) #0, !dbg !97 ; line:43 col:1 + ret void, !dbg !97 ; line:43 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*)"(i32, %dx.types.HitObject*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %\22class.RayQuery<5, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<5, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32, %dx.types.HitObject*, %"class.RayQuery<5, 0>"*) #0 + +; Function Attrs: nounwind readonly +declare i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32, %dx.types.HitObject*, %"class.RayQuery<5, 0>"*, i32, %struct.CustomAttrs*) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!dx.version = !{!2} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = !{!4, !26} +!dx.entryPoints = !{!30} 
+!dx.fnprops = !{!42} +!dx.options = !{!43, !44} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"lib", i32 6, i32 9} +!4 = !{i32 0, %"class.RWStructuredBuffer" undef, !5, %struct.RayDesc undef, !10, %"class.dx::HitObject" undef, !15, %"class.RayQuery<5, 0>" undef, !17, %struct.CustomAttrs undef, !23} +!5 = !{i32 4, !6, !7} +!6 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9} +!7 = !{i32 0, !8} +!8 = !{!9} +!9 = !{i32 0, float undef} +!10 = !{i32 32, !11, !12, !13, !14} +!11 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!12 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!13 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3} +!14 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!15 = !{i32 4, !16} +!16 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!17 = !{i32 4, !18, !19} +!18 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!19 = !{i32 0, !20} +!20 = !{!21, !22} +!21 = !{i32 1, i64 5} +!22 = !{i32 1, i64 0} +!23 = !{i32 8, !24, !25} +!24 = !{i32 6, !"x", i32 3, i32 0, i32 7, i32 9} +!25 = !{i32 6, !"y", i32 3, i32 4, i32 7, i32 9} +!26 = !{i32 1, void ()* @"\01?main@@YAXXZ", !27} +!27 = !{!28} +!28 = !{i32 1, !29, !29} +!29 = !{} +!30 = !{null, !"", null, !31, null} +!31 = !{!32, !35, !40, null} +!32 = !{!33} +!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34} +!34 = !{i32 0, i32 4} +!35 = !{!36, !38, !39} +!36 = !{i32 0, %"class.RWStructuredBuffer"* @"\01?UAV@@3V?$RWStructuredBuffer@M@@A", !"UAV", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false, i1 false, !37} +!37 = !{i32 1, i32 4} +!38 = !{i32 1, %struct.RWByteAddressBuffer* @"\01?inbuf@@3URWByteAddressBuffer@@A", !"inbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!39 = !{i32 2, %struct.RWByteAddressBuffer* @"\01?outbuf@@3URWByteAddressBuffer@@A", !"outbuf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!40 = !{!41} +!41 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null} +!42 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!43 = !{i32 -2147483584} +!44 = !{i32 -1} +!45 = !DILocation(line: 26, column: 3, scope: !46) +!46 = !DISubprogram(name: "main", scope: !47, file: !47, line: 25, type: !48, isLocal: false, isDefinition: true, scopeLine: 25, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!47 = !DIFile(filename: "hitobject_fromrayquery_scalarrepl.hlsl", directory: "") +!48 = !DISubroutineType(types: !29) +!49 = !DILocation(line: 26, column: 78, scope: !46) +!50 = !DILocation(line: 27, column: 3, scope: !46) +!51 = !DILocation(line: 8, column: 8, scope: !52, inlinedAt: !53) +!52 = !DISubprogram(name: "MakeRayDesc", scope: !47, file: !47, line: 6, type: !48, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false) +!53 = distinct !DILocation(line: 27, column: 17, scope: !46) +!54 = !DILocation(line: 8, column: 15, scope: !52, inlinedAt: !53) +!55 = !{!56, !56, i64 0} +!56 = !{!"omnipotent char", !57, i64 0} +!57 = !{!"Simple C/C++ TBAA"} +!58 = !{!59} +!59 = distinct !{!59, !60, !"\01?MakeRayDesc@@YA?AURayDesc@@XZ: %agg.result"} +!60 = distinct !{!60, !"\01?MakeRayDesc@@YA?AURayDesc@@XZ"} +!61 = !DILocation(line: 9, column: 8, scope: !52, inlinedAt: !53) +!62 = !DILocation(line: 9, column: 18, scope: !52, inlinedAt: !53) +!63 = !DILocation(line: 10, column: 8, scope: !52, 
inlinedAt: !53) +!64 = !DILocation(line: 10, column: 13, scope: !52, inlinedAt: !53) +!65 = !{!66, !66, i64 0} +!66 = !{!"float", !56, i64 0} +!67 = !DILocation(line: 11, column: 8, scope: !52, inlinedAt: !53) +!68 = !DILocation(line: 11, column: 13, scope: !52, inlinedAt: !53) +!69 = !DILocation(line: 28, column: 3, scope: !46) +!70 = !DILocation(line: 30, column: 7, scope: !46) +!71 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !73) +!72 = !DISubprogram(name: "Use", scope: !47, file: !47, line: 20, type: !48, isLocal: false, isDefinition: true, scopeLine: 20, flags: DIFlagPrototyped, isOptimized: false) +!73 = distinct !DILocation(line: 30, column: 3, scope: !46) +!74 = !DILocation(line: 32, column: 3, scope: !46) +!75 = !DILocation(line: 33, column: 13, scope: !46) +!76 = !DILocation(line: 33, column: 9, scope: !46) +!77 = !DILocation(line: 33, column: 11, scope: !46) +!78 = !DILocation(line: 34, column: 13, scope: !46) +!79 = !DILocation(line: 34, column: 9, scope: !46) +!80 = !DILocation(line: 34, column: 11, scope: !46) +!81 = !DILocation(line: 35, column: 7, scope: !46) +!82 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !83) +!83 = distinct !DILocation(line: 35, column: 3, scope: !46) +!84 = !DILocation(line: 37, column: 13, scope: !46) +!85 = !DILocation(line: 37, column: 9, scope: !46) +!86 = !DILocation(line: 37, column: 11, scope: !46) +!87 = !DILocation(line: 38, column: 13, scope: !46) +!88 = !DILocation(line: 38, column: 9, scope: !46) +!89 = !DILocation(line: 38, column: 11, scope: !46) +!90 = !DILocation(line: 39, column: 7, scope: !46) +!91 = !DILocation(line: 21, column: 3, scope: !72, inlinedAt: !92) +!92 = distinct !DILocation(line: 39, column: 3, scope: !46) +!93 = !DILocation(line: 41, column: 25, scope: !46) +!94 = !DILocation(line: 41, column: 3, scope: !46) +!95 = !DILocation(line: 42, column: 25, scope: !46) +!96 = !DILocation(line: 42, column: 3, scope: !46) +!97 = !DILocation(line: 43, column: 1, scope: !46) diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll index 89ee886c2e..78f7271e94 100644 --- a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_make_scalarrepl.ll @@ -33,7 +33,7 @@ entry: %hit = alloca %dx.types.HitObject, align 4 %tmp = alloca %dx.types.HitObject, align 4 %ray = alloca %struct.RayDesc, align 4 -; CHECK-NOT: %{{[^ ]+}} = alloca %struct.RayDesc +; CHECK-NOT: alloca %struct.RayDesc %tmp2 = alloca %dx.types.HitObject, align 4 ; CHECK: %[[HIT0:[^ ]+]] = alloca %dx.types.HitObject, align 4 ; CHECK: %[[HIT1:[^ ]+]] = alloca %dx.types.HitObject, align 4 @@ -69,7 +69,16 @@ entry: ; CHECK-DAG: %[[RDTMIN:[^ ]+]] = load float, float* %[[pRDTMIN]], ; CHECK-DAG: %[[RDD:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDD]], ; CHECK-DAG: %[[RDTMAX:[^ ]+]] = load float, float* %[[pRDTMAX]], -; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 387, %dx.types.HitObject* %[[HIT2]], i32 0, i32 1, <3 x float> %[[RDO]], float %[[RDTMIN]], <3 x float> %[[RDD]], float %[[RDTMAX]]) +; Copy introduced for RayDesc argument +; CHECK-DAG: store <3 x float> %[[RDO]], <3 x float>* %[[pRDO2:[^ ]+]], +; CHECK-DAG: store float %[[RDTMIN]], float* %[[pRDTMIN2:[^ ]+]], +; CHECK-DAG: store <3 x float> %[[RDD]], <3 x float>* %[[pRDD2:[^ ]+]], +; CHECK-DAG: store float %[[RDTMAX]], float* %[[pRDTMAX2:[^ ]+]], 
+; CHECK-DAG: %[[RDO2:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDO2]], +; CHECK-DAG: %[[RDTMIN2:[^ ]+]] = load float, float* %[[pRDTMIN2]], +; CHECK-DAG: %[[RDD2:[^ ]+]] = load <3 x float>, <3 x float>* %[[pRDD2]], +; CHECK-DAG: %[[RDTMAX2:[^ ]+]] = load float, float* %[[pRDTMAX2]], +; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 387, %dx.types.HitObject* %[[HIT2]], i32 0, i32 1, <3 x float> %[[RDO2]], float %[[RDTMIN2]], <3 x float> %[[RDD2]], float %[[RDTMAX2]]) call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32, i32, %struct.RayDesc*)"(i32 387, %dx.types.HitObject* %tmp2, i32 0, i32 1, %struct.RayDesc* %ray), !dbg !31 ; line:45 col:3 %10 = bitcast %dx.types.HitObject* %tmp2 to i8*, !dbg !31 ; line:45 col:3 call void @llvm.lifetime.end(i64 4, i8* %10) #0, !dbg !31 ; line:45 col:3 diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll new file mode 100644 index 0000000000..fa22ee5744 --- /dev/null +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/hitobject_traceinvoke_scalarrepl.ll @@ -0,0 +1,198 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s + +; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/HitObject/hitobject_traceinvoke.hlsl + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%"class.RWStructuredBuffer" = type { float } +%ConstantBuffer = type opaque +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%struct.Payload = type { <3 x float> } +%dx.types.HitObject = type { i8* } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.dx::HitObject" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"\01?UAV@@3V?$RWStructuredBuffer@M@@A" = external global %"class.RWStructuredBuffer", align 4 +@"$Globals" = external constant %ConstantBuffer + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +entry: + %rayDesc = alloca %struct.RayDesc, align 4 + %pld = alloca %struct.Payload, align 4 + + ; CHECK: %[[HITOBJ:[^ ,]+]] = alloca %dx.types.HitObject, align 4 + + %hit = alloca %dx.types.HitObject, align 4 + + %0 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !37 ; line:82 col:3 + call void @llvm.lifetime.start(i64 32, i8* %0) #0, !dbg !37 ; line:82 col:3 + + ; Init RayDesc. 
+ ; CHECK-DAG: store <3 x float> , <3 x float>* %[[ORIGIN_P0:[^ ,]+]], align 4 + ; CHECK-DAG: store float 3.000000e+00, float* %[[TMIN_P0:[^ ,]+]], align 4 + ; CHECK-DAG: store <3 x float> , <3 x float>* %[[DIRECTION_P0:[^ ,]+]], align 4 + ; CHECK-DAG: store float 7.000000e+00, float* %[[TMAX_P0:[^ ,]+]], align 4 + + %Origin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 0, !dbg !41 ; line:83 col:11 + store <3 x float> , <3 x float>* %Origin, align 4, !dbg !42, !tbaa !43 ; line:83 col:18 + %TMin = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 1, !dbg !46 ; line:84 col:11 + store float 3.000000e+00, float* %TMin, align 4, !dbg !47, !tbaa !48 ; line:84 col:16 + %Direction = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 2, !dbg !50 ; line:85 col:11 + store <3 x float> , <3 x float>* %Direction, align 4, !dbg !51, !tbaa !43 ; line:85 col:21 + %TMax = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %rayDesc, i32 0, i32 3, !dbg !52 ; line:86 col:11 + store float 7.000000e+00, float* %TMax, align 4, !dbg !53, !tbaa !48 ; line:86 col:16 + + %1 = bitcast %struct.Payload* %pld to i8*, !dbg !54 ; line:88 col:3 + call void @llvm.lifetime.start(i64 12, i8* %1) #0, !dbg !54 ; line:88 col:3 + %dummy = getelementptr inbounds %struct.Payload, %struct.Payload* %pld, i32 0, i32 0, !dbg !55 ; line:89 col:7 + store <3 x float> , <3 x float>* %dummy, align 4, !dbg !56, !tbaa !43 ; line:89 col:13 + %2 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !57 ; line:91 col:3 + call void @llvm.lifetime.start(i64 4, i8* %2) #0, !dbg !57 ; line:91 col:3 + %3 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !58 ; line:91 col:23 + %4 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %3), !dbg !58 ; line:91 col:23 + + ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef) + + %5 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %4, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !58 ; line:91 col:23 + + ; Copy RayDesc. + ; CHECK-DAG: %[[ORIGIN_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIGIN_P0]] + ; CHECK-DAG: store <3 x float> %[[ORIGIN_L0]], <3 x float>* %[[ORIGIN_P1:[^ ,]+]] + ; CHECK-DAG: %[[TMIN_L0:[^ ,]+]] = load float, float* %[[TMIN_P0]] + ; CHECK-DAG: store float %[[TMIN_L0]], float* %[[TMIN_P1:[^ ,]+]] + ; CHECK-DAG: %[[DIRECTION_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_P0]] + ; CHECK-DAG: store <3 x float> %[[DIRECTION_L0]], <3 x float>* %[[DIRECTION_P1:[^ ,]+]] + ; CHECK-DAG: %[[TMAX_L0:[^ ,]+]] = load float, float* %[[TMAX_P0]] + ; CHECK-DAG: store float %[[TMAX_L0]], float* %[[TMAX_P1:[^ ,]+]] + + ; Load RayDesc. 
+ ; CHECK-DAG: %[[ORIGIN_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIGIN_P1]] + ; CHECK-DAG: %[[TMIN_L1:[^ ,]+]] = load float, float* %[[TMIN_P1]] + ; CHECK-DAG: %[[DIRECTION_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_P1]] + ; CHECK-DAG: %[[TMAX_L1:[^ ,]+]] = load float, float* %[[TMAX_P1]] + + ; RayDesc is scalar replaced in HL op for dx::HitObject::TraceRay. + ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 389, %dx.types.HitObject* %[[HITOBJ]], %dx.types.Handle %[[RTAS]], i32 513, i32 1, i32 2, i32 4, i32 0, <3 x float> %[[ORIGIN_L1]], float %[[TMIN_L1]], <3 x float> %[[DIRECTION_L1]], float %[[TMAX_L1]], %struct.Payload* %[[PLD_P0:[^ ,]+]]) + + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 389, %dx.types.HitObject* %hit, %dx.types.Handle %5, i32 513, i32 1, i32 2, i32 4, i32 0, %struct.RayDesc* %rayDesc, %struct.Payload* %pld), !dbg !58 ; line:91 col:23 + + ; Copy payload. + ; CHECK: %[[GEP_PLD_P0:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0]], i32 0, i32 0 + ; CHECK: %[[PLD_L0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[GEP_PLD_P0]] + ; CHECK: store <3 x float> %[[PLD_L0]], <3 x float>* %[[PLD_M0_P0:[^ ,]+]] + ; CHECK: %[[GEP_PLD_P1:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P1:[^ ,]+]], i32 0, i32 0 + ; CHECK: [[PLD_L1:[^ ,]+]] = load <3 x float>, <3 x float>* %[[PLD_M0_P0]] + ; CHECK: store <3 x float> [[PLD_L1]], <3 x float>* %[[GEP_PLD_P1]] + + ; dx::HitObject::Invoke + ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %[[HITOBJ]], %struct.Payload* %[[PLD_P1]]) + + call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.Payload*)"(i32 382, %dx.types.HitObject* %hit, %struct.Payload* %pld), !dbg !59 ; line:101 col:3 + + %6 = bitcast %dx.types.HitObject* %hit to i8*, !dbg !60 ; line:102 col:1 + call void @llvm.lifetime.end(i64 4, i8* %6) #0, !dbg !60 ; line:102 col:1 + %7 = bitcast %struct.Payload* %pld to i8*, !dbg !60 ; line:102 col:1 + call void @llvm.lifetime.end(i64 12, i8* %7) #0, !dbg !60 ; line:102 col:1 + %8 = bitcast %struct.RayDesc* %rayDesc to i8*, !dbg !60 ; line:102 col:1 + call void @llvm.lifetime.end(i64 32, i8* %8) #0, !dbg !60 ; line:102 col:1 + ret void, !dbg !60 ; line:102 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.HitObject*, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, 
%dx.types.HitObject*, %struct.Payload*)"(i32, %dx.types.HitObject*, %struct.Payload*) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!3} +!dx.shaderModel = !{!4} +!dx.typeAnnotations = !{!5, !20} +!dx.entryPoints = !{!24} +!dx.fnprops = !{!34} +!dx.options = !{!35, !36} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"} +!3 = !{i32 1, i32 9} +!4 = !{!"lib", i32 6, i32 9} +!5 = !{i32 0, %"class.RWStructuredBuffer" undef, !6, %struct.RayDesc undef, !11, %struct.Payload undef, !16, %"class.dx::HitObject" undef, !18} +!6 = !{i32 4, !7, !8} +!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9} +!8 = !{i32 0, !9} +!9 = !{!10} +!10 = !{i32 0, float undef} +!11 = !{i32 32, !12, !13, !14, !15} +!12 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!13 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!14 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9, i32 13, i32 3} +!15 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!16 = !{i32 12, !17} +!17 = !{i32 6, !"dummy", i32 3, i32 0, i32 7, i32 9, i32 13, i32 3} +!18 = !{i32 4, !19} +!19 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 4} +!20 = !{i32 1, void ()* @"\01?main@@YAXXZ", !21} +!21 = !{!22} +!22 = !{i32 1, !23, !23} +!23 = !{} +!24 = !{null, !"", null, !25, null} +!25 = !{!26, !29, !32, null} +!26 = !{!27} +!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28} +!28 = !{i32 0, i32 4} +!29 = !{!30} +!30 = !{i32 0, %"class.RWStructuredBuffer"* @"\01?UAV@@3V?$RWStructuredBuffer@M@@A", !"UAV", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false, i1 false, !31} +!31 = !{i32 1, i32 4} +!32 = !{!33} +!33 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null} +!34 = !{void ()* @"\01?main@@YAXXZ", i32 7} +!35 = !{i32 -2147483584} +!36 = !{i32 -1} +!37 = !DILocation(line: 82, column: 3, scope: !38) +!38 = !DISubprogram(name: "main", scope: !39, file: !39, line: 81, type: !40, isLocal: false, isDefinition: true, scopeLine: 81, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @"\01?main@@YAXXZ") +!39 = !DIFile(filename: "D:\5Cgit\5Cdxc\5Cmain\5Ctools\5Cclang\5Ctest\5CCodeGenDXIL\5Chlsl\5Cobjects\5CHitObject\5Chitobject_traceinvoke.hlsl", directory: "") +!40 = !DISubroutineType(types: !23) +!41 = !DILocation(line: 83, column: 11, scope: !38) +!42 = !DILocation(line: 83, column: 18, scope: !38) +!43 = !{!44, !44, i64 0} +!44 = !{!"omnipotent char", !45, i64 0} +!45 = !{!"Simple C/C++ TBAA"} +!46 = !DILocation(line: 84, column: 11, scope: !38) +!47 = !DILocation(line: 84, column: 16, scope: !38) +!48 = !{!49, !49, i64 0} +!49 = !{!"float", !44, i64 0} +!50 = !DILocation(line: 85, column: 11, scope: !38) +!51 = !DILocation(line: 85, column: 21, scope: !38) +!52 = !DILocation(line: 86, column: 11, scope: !38) +!53 = !DILocation(line: 86, column: 16, scope: !38) +!54 = !DILocation(line: 88, column: 3, scope: !38) +!55 = !DILocation(line: 89, column: 7, scope: !38) +!56 = !DILocation(line: 89, column: 13, scope: !38) +!57 = !DILocation(line: 91, column: 3, scope: !38) +!58 = !DILocation(line: 91, column: 23, scope: !38) +!59 = !DILocation(line: 101, column: 3, scope: !38) +!60 = !DILocation(line: 102, column: 1, scope: !38) diff --git 
a/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll new file mode 100644 index 0000000000..59551a7eb4 --- /dev/null +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/traceray_scalarrepl.ll @@ -0,0 +1,182 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%"$Globals" = type { i32, i32, i32, i32, i32 } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%struct.Payload = type { <2 x float>, <3 x i32> } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?Acc@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"\01?RayFlags@@3IB" = external constant i32, align 4 +@"\01?InstanceInclusionMask@@3IB" = external constant i32, align 4 +@"\01?RayContributionToHitGroupIndex@@3IB" = external constant i32, align 4 +@"\01?MultiplierForGeometryContributionToHitGroupIndex@@3IB" = external constant i32, align 4 +@"\01?MissShaderIndex@@3IB" = external constant i32, align 4 +@"$Globals" = external constant %"$Globals" + +; CHECK: define <4 x float> @" +; CHECK-SAME: ?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z"(<2 x float>* noalias dereferenceable(8) %f2, %struct.RayDesc* %Ray, %struct.Payload* noalias %p) + +; Function Attrs: nounwind +define <4 x float> @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z"(<2 x float>* noalias dereferenceable(8) %f2, %struct.RayDesc* %Ray, %struct.Payload* noalias %p) #0 { +entry: + + ; Copy Payload fields (PLD_F0, PLD_F1) to local allocas: + ; CHECK: %[[GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %p, i32 0, i32 0 + ; CHECK: %[[LOAD:[^ ,]+]] = load <2 x float>, <2 x float>* %[[GEP]] + ; CHECK: store <2 x float> %[[LOAD]], <2 x float>* %[[PLD_F0:[^ ,]+]] + ; CHECK: %[[GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %p, i32 0, i32 1 + ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x i32>, <3 x i32>* %[[GEP]] + ; CHECK: store <3 x i32> %[[LOAD]], <3 x i32>* %[[PLD_F1:[^ ,]+]] + + %0 = alloca %struct.RayDesc, !dbg !39 ; line:22 col:61 + %1 = bitcast %struct.RayDesc* %0 to i8*, !dbg !39 ; line:22 col:61 + %2 = bitcast %struct.RayDesc* %Ray to i8*, !dbg !39 ; line:22 col:61 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 32, i32 1, i1 false), !dbg !39 ; line:22 col:61 + %3 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0), !dbg !39 ; line:22 col:61 + %4 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 13, i32 20 }, %"$Globals" undef), !dbg !39 ; line:22 col:61 + %5 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %4, i32 0), !dbg !39 ; line:22 col:61 + %6 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 0, !dbg !39 ; line:22 col:61 + %7 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 1, !dbg !39 ; line:22 col:61 + %8 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 2, !dbg !39 ; line:22 col:61 + %9 = getelementptr inbounds 
%"$Globals", %"$Globals"* %5, i32 0, i32 3, !dbg !39 ; line:22 col:61 + %10 = getelementptr inbounds %"$Globals", %"$Globals"* %5, i32 0, i32 4, !dbg !39 ; line:22 col:61 + %11 = load i32, i32* %10, align 4, !dbg !39, !tbaa !43 ; line:22 col:61 + %12 = load i32, i32* %9, align 4, !dbg !47, !tbaa !43 ; line:22 col:12 + %13 = load i32, i32* %8, align 4, !dbg !48, !tbaa !43 ; line:21 col:12 + %14 = load i32, i32* %7, align 4, !dbg !49, !tbaa !43 ; line:20 col:25 + %15 = load i32, i32* %6, align 4, !dbg !50, !tbaa !43 ; line:20 col:16 + %16 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?Acc@@3URaytracingAccelerationStructure@@A", !dbg !51 ; line:20 col:3 + %17 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %16), !dbg !51 ; line:20 col:3 + + ; CHECK: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef) + %18 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %17, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !51 ; line:20 col:3 + + ; Copy RayDesc fields (Origin, TMin, Direction, TMax) to local allocas: + ; CHECK: %[[RAY_ORIGIN_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 0 + ; CHECK: %[[RAY_ORIGIN_LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RAY_ORIGIN_GEP]] + ; CHECK: store <3 x float> %[[RAY_ORIGIN_LOAD]], <3 x float>* %[[RAY_ORIGIN_P0:[^ ,]+]] + ; CHECK: %[[TMIN_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 1 + ; CHECK: %[[TMIN_LOAD:[^ ,]+]] = load float, float* %[[TMIN_GEP]] + ; CHECK: store float %[[TMIN_LOAD]], float* %[[TMIN_P0:[^ ,]+]] + ; CHECK: %[[DIRECTION_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 2 + ; CHECK: %[[DIRECTION_LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIRECTION_GEP]] + ; CHECK: store <3 x float> %[[DIRECTION_LOAD]], <3 x float>* %[[DIRECTION_P0:[^ ,]+]] + ; CHECK: %[[TMAX_GEP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %Ray, i32 0, i32 3 + ; CHECK: %[[TMAX_LOAD:[^ ,]+]] = load float, float* %[[TMAX_GEP]] + ; CHECK: store float %[[TMAX_LOAD]], float* %[[TMAX_P0:[^ ,]+]] + + ; Copy Payload fields into payload struct for call: + ; CHECK: %[[PLD_F0_GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0:[^ ,]+]], i32 0, i32 0 + ; CHECK: %[[PLD_F0_LOAD:[^ ,]+]] = load <2 x float>, <2 x float>* %[[PLD_F0]] + ; CHECK: store <2 x float> %[[PLD_F0_LOAD]], <2 x float>* %[[PLD_F0_GEP]] + ; CHECK: %[[PLD_F1_GEP:[^ ,]+]] = getelementptr inbounds %struct.Payload, %struct.Payload* %[[PLD_P0]], i32 0, i32 1 + ; CHECK: %[[PLD_F1_LOAD:[^ ,]+]] = load <3 x i32>, <3 x i32>* %[[PLD_F1]] + ; CHECK: store <3 x i32> %[[PLD_F1_LOAD]], <3 x i32>* %[[PLD_F1_GEP]] + + ; Load RayDesc fields: + ; CHECK: %[[RAY_ORIGIN_LOAD2:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RAY_ORIGIN_P0]] + ; CHECK: %[[TMIN_LOAD2:[^ ,]+]] = load float, float* %[[TMIN_P0]] + ; CHECK: %[[DIRECTION_LOAD2:[^ ,]+]] = load <3 x float>, <3 x 
float>* %[[DIRECTION_P0]] + ; CHECK: %[[TMAX_LOAD2:[^ ,]+]] = load float, float* %[[TMAX_P0]] + + ; call TraceRay with the local allocas: + ; CHECK: call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, <3 x float>, float, <3 x float>, float, %struct.Payload*)"(i32 69, %dx.types.Handle %[[RTAS]], i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, i32 %{{[^ ,]+}}, <3 x float> %[[RAY_ORIGIN_LOAD2]], float %[[TMIN_LOAD2]], <3 x float> %[[DIRECTION_LOAD2]], float %[[TMAX_LOAD2]], %struct.Payload* %[[PLD_P0]]) + + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32 69, %dx.types.Handle %18, i32 %15, i32 %14, i32 %13, i32 %12, i32 %11, %struct.RayDesc* %0, %struct.Payload* %p), !dbg !51 ; line:20 col:3 + + ret <4 x float> , !dbg !52 ; line:24 col:4 +} + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*)"(i32, %dx.types.Handle, i32, i32, i32, i32, i32, %struct.RayDesc*, %struct.Payload*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6, !21} +!dx.entryPoints = !{!30} +!dx.fnprops = !{} +!dx.options = !{!37, !38} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.4928 (ser_hlslattributes_patch, 937c16cc6)"} +!3 = !{i32 1, i32 3} +!4 = !{i32 1, i32 9} +!5 = !{!"lib", i32 6, i32 3} +!6 = !{i32 0, %struct.RayDesc undef, !7, %struct.Payload undef, !12, %"$Globals" undef, !15} +!7 = !{i32 32, !8, !9, !10, !11} +!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9} +!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9} +!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!12 = !{i32 28, !13, !14} +!13 = !{i32 6, !"t", i32 3, i32 0, i32 7, i32 9} +!14 = !{i32 6, !"t2", i32 3, i32 16, i32 7, i32 4} +!15 = !{i32 20, !16, !17, !18, !19, !20} +!16 = !{i32 6, !"RayFlags", i32 3, i32 0, i32 7, i32 5} +!17 = !{i32 6, !"InstanceInclusionMask", i32 3, i32 4, i32 7, i32 5} +!18 = !{i32 6, !"RayContributionToHitGroupIndex", i32 3, i32 
8, i32 7, i32 5} +!19 = !{i32 6, !"MultiplierForGeometryContributionToHitGroupIndex", i32 3, i32 12, i32 7, i32 5} +!20 = !{i32 6, !"MissShaderIndex", i32 3, i32 16, i32 7, i32 5} +!21 = !{i32 1, <4 x float> (<2 x float>*, %struct.RayDesc*, %struct.Payload*)* @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z", !22} +!22 = !{!23, !26, !27, !29} +!23 = !{i32 1, !24, !25} +!24 = !{i32 7, i32 9} +!25 = !{} +!26 = !{i32 2, !24, !25} +!27 = !{i32 0, !28, !25} +!28 = !{i32 4, !"R"} +!29 = !{i32 2, !25, !25} +!30 = !{null, !"", null, !31, null} +!31 = !{!32, null, !35, null} +!32 = !{!33} +!33 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?Acc@@3URaytracingAccelerationStructure@@A", !"Acc", i32 -1, i32 -1, i32 1, i32 16, i32 0, !34} +!34 = !{i32 0, i32 4} +!35 = !{!36} +!36 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 20, null} +!37 = !{i32 -2147483584} +!38 = !{i32 11} +!39 = !DILocation(line: 22, column: 61, scope: !40) +!40 = !DISubprogram(name: "emit", scope: !41, file: !41, line: 19, type: !42, isLocal: false, isDefinition: true, scopeLine: 19, flags: DIFlagPrototyped, isOptimized: false, function: <4 x float> (<2 x float>*, %struct.RayDesc*, %struct.Payload*)* @"\01?emit@@YA?AV?$vector@M$03@@AIAV?$vector@M$01@@URayDesc@@UPayload@@@Z") +!41 = !DIFile(filename: "D:\5Cgit\5Cdxc\5Cmain\5Ctools\5Cclang\5Ctest\5CHLSLFileCheck\5Cshader_targets\5Craytracing\5Craytracing_traceray.hlsl", directory: "") +!42 = !DISubroutineType(types: !25) +!43 = !{!44, !44, i64 0} +!44 = !{!"int", !45, i64 0} +!45 = !{!"omnipotent char", !46, i64 0} +!46 = !{!"Simple C/C++ TBAA"} +!47 = !DILocation(line: 22, column: 12, scope: !40) +!48 = !DILocation(line: 21, column: 12, scope: !40) +!49 = !DILocation(line: 20, column: 25, scope: !40) +!50 = !DILocation(line: 20, column: 16, scope: !40) +!51 = !DILocation(line: 20, column: 3, scope: !40) +!52 = !DILocation(line: 24, column: 4, scope: !40) diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll new file mode 100644 index 0000000000..c01ec797bb --- /dev/null +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_cb_raydesc_scalarrepl.ll @@ -0,0 +1,154 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s + +; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%"$Globals" = type { %struct.RayDesc } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.RayQuery<513, 0>" = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"\01?rayDesc@@3URayDesc@@B" = external constant %struct.RayDesc, align 4 +@"$Globals" = external constant %"$Globals" + +; Function Attrs: nounwind +define void @main() #0 { +entry: + %0 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32 0, %"$Globals"* @"$Globals", i32 0) + + ; Capture CB, RayDesc ptr from CB, RTAS, and init RayQuery + ; CHECK-DAG: %[[CB_H:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, 
%dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef) + + %1 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32 14, %dx.types.Handle %0, %dx.types.ResourceProperties { i32 13, i32 32 }, %"$Globals" undef) + + ; CHECK-DAG: %[[CB_PTR:[^ ,]+]] = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %[[CB_H]], i32 0) + + %2 = call %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32 6, %dx.types.Handle %1, i32 0) + + ; CHECK-DAG: %[[RAYDESC_PTR:[^ ,]+]] = getelementptr inbounds %"$Globals", %"$Globals"* %[[CB_PTR]], i32 0, i32 0 + + %3 = getelementptr inbounds %"$Globals", %"$Globals"* %2, i32 0, i32 0 + + ; CHECK-DAG: %[[RQ0:[^ ,]+]] = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0) + ; CHECK-DAG: store i32 %[[RQ0]], i32* %[[RQ_P0:[^ ,]+]] + + %rayQuery = alloca %"class.RayQuery<513, 0>", align 4 + %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !34 ; line:12 col:71 + %4 = getelementptr inbounds %"class.RayQuery<513, 0>", %"class.RayQuery<513, 0>"* %rayQuery, i32 0, i32 0, !dbg !34 ; line:12 col:71 + store i32 %rayQuery1, i32* %4, !dbg !34 ; line:12 col:71 + + %5 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !38 ; line:13 col:3 + %6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %5), !dbg !38 ; line:13 col:3 + + ; CHECK-DAG: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef) + + %7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !38 ; line:13 col:3 + + ; Load RayDesc fields from CB to local copy + ; CHECK-DAG: %[[ORIG_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 0 + ; CHECK-DAG: %[[ORIG_LD_CB:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIG_CBP]] + ; CHECK-DAG: store <3 x float> %[[ORIG_LD_CB]], <3 x float>* %[[ORIG_P0:[^ ,]+]] + ; CHECK-DAG: %[[TMIN_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 1 + ; CHECK-DAG: %[[TMIN_LD_CB:[^ ,]+]] = load float, float* %[[TMIN_CBP]] + ; CHECK-DAG: store float %[[TMIN_LD_CB]], float* %[[TMIN_P0:[^ ,]+]] + ; CHECK-DAG: %[[DIR_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 2 + ; CHECK-DAG: %[[DIR_LD_CB:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIR_CBP]] + ; CHECK-DAG: store <3 x float> %[[DIR_LD_CB]], <3 x float>* %[[DIR_P0:[^ ,]+]] + ; CHECK-DAG: %[[TMAX_CBP:[^ ,]+]] = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* %[[RAYDESC_PTR]], i32 0, i32 3 + ; CHECK-DAG: %[[TMAX_LD_CB:[^ ,]+]] = load float, float* %[[TMAX_CBP]] + ; CHECK-DAG: store float 
%[[TMAX_LD_CB]], float* %[[TMAX_P0:[^ ,]+]] + + ; Load RayDesc fields from local copy + ; CHECK-DAG: %[[ORIG:[^ ,]+]] = load <3 x float>, <3 x float>* %[[ORIG_P0]] + ; CHECK-DAG: %[[TMIN:[^ ,]+]] = load float, float* %[[TMIN_P0]] + ; CHECK-DAG: %[[DIR:[^ ,]+]] = load <3 x float>, <3 x float>* %[[DIR_P0]] + ; CHECK-DAG: %[[TMAX:[^ ,]+]] = load float, float* %[[TMAX_P0]] + ; CHECK-DAG: %[[RQ:[^ ,]+]] = load i32, i32* %[[RQ_P0]] + + ; Call TraceRayInline + ; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, <3 x float> %[[ORIG]], float %[[TMIN]], <3 x float> %[[DIR]], float %[[TMAX]]) + + call void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<513, 0>"* %rayQuery, %dx.types.Handle %7, i32 1, i32 2, %struct.RayDesc* %3), !dbg !38 ; line:13 col:3 + ret void, !dbg !39 ; line:14 col:1 +} + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<513, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %"$Globals"* @"dx.hl.subscript.cb.rn.%\22$Globals\22* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22$Globals\22*, i32)"(i32, %"$Globals"*, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22$Globals\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"$Globals") #1 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6, !20} +!dx.entryPoints = !{!24} +!dx.fnprops = !{!31} +!dx.options = !{!32, !33} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"} +!3 = !{i32 1, i32 5} +!4 = !{i32 1, i32 9} +!5 = !{!"vs", i32 6, i32 5} +!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12, %"$Globals" undef, !18} +!7 = !{i32 32, !8, !9, !10, !11} +!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9} +!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9} +!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!12 = !{i32 4, !13, !14} +!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!14 = !{i32 0, !15} +!15 = !{!16, !17} +!16 = !{i32 1, i64 513} +!17 = !{i32 1, i64 0} +!18 = !{i32 32, !19} +!19 = !{i32 6, !"rayDesc", i32 3, i32 0} +!20 = !{i32 1, void ()* @main, !21} 
+!21 = !{!22} +!22 = !{i32 1, !23, !23} +!23 = !{} +!24 = !{void ()* @main, !"main", null, !25, null} +!25 = !{!26, null, !29, null} +!26 = !{!27} +!27 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !28} +!28 = !{i32 0, i32 4} +!29 = !{!30} +!30 = !{i32 0, %"$Globals"* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 32, null} +!31 = !{void ()* @main, i32 1} +!32 = !{i32 64} +!33 = !{i32 -1} +!34 = !DILocation(line: 12, column: 71, scope: !35) +!35 = !DISubprogram(name: "main", scope: !36, file: !36, line: 11, type: !37, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @main) +!36 = !DIFile(filename: "/home/texr/git/dxc/main/tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline_cb_raydesc.hlsl", directory: "") +!37 = !DISubroutineType(types: !23) +!38 = !DILocation(line: 13, column: 3, scope: !35) +!39 = !DILocation(line: 14, column: 1, scope: !35) diff --git a/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll new file mode 100644 index 0000000000..ee76872441 --- /dev/null +++ b/tools/clang/test/DXC/Passes/ScalarReplHLSL/tracerayinline_scalarrepl.ll @@ -0,0 +1,155 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s + +; Based on tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl, +; with call to DoTrace commented out. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.RaytracingAccelerationStructure = type { i32 } +%ConstantBuffer = type opaque +%struct.RayDesc = type { <3 x float>, float, <3 x float>, float } +%"class.RayQuery<513, 0>" = type { i32 } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external global %struct.RaytracingAccelerationStructure, align 4 +@"$Globals" = external constant %ConstantBuffer + +; CHECK: define void @main(float* noalias, <3 x float>, float, <3 x float>, float) + +; Function Attrs: nounwind +define float @main(%struct.RayDesc* %rayDesc) #0 { +entry: + %0 = alloca %struct.RayDesc + + ; Copy flattened RayDesc input to main function + ; RayDesc fields: %1: Origin, %2: TMin, %3: Direction, %4: TMax + ; CHECK: store float %4, float* %[[RD3_P0:[^ ,]+]] + ; CHECK: store <3 x float> %3, <3 x float>* %[[RD2_P0:[^ ,]+]] + ; CHECK: store float %2, float* %[[RD1_P0:[^ ,]+]] + ; CHECK: store <3 x float> %1, <3 x float>* %[[RD0_P0:[^ ,]+]] + + ; Copy RayDesc fields again + ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P0]] + ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD0_P1:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD1_P0]] + ; CHECK: store float %[[LOAD]], float* %[[RD1_P1:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P0]] + ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD2_P1:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD3_P0]] + ; CHECK: store float %[[LOAD]], float* %[[RD3_P1:[^ ,]+]] + + %1 = bitcast %struct.RayDesc* %0 to i8* + %2 = bitcast %struct.RayDesc* %rayDesc to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 32, i32 1, i1 false) + + ; Capture RayQuery ptr and RTAS handle + ; CHECK: %[[RQ0:[^ ]+]] = call i32 @"dx.hl.op..i32 (i32, 
i32, i32)"(i32 4, i32 513, i32 0) + ; CHECK: store i32 %[[RQ0]], i32* %[[RQ_P0:[^ ,]+]] + ; CHECK: %[[RTAS:[^ ,]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %{{[^ ,]+}}, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef) + + %rayQuery = alloca %"class.RayQuery<513, 0>", align 4 + %rayQuery1 = call i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32 4, i32 513, i32 0), !dbg !35 ; line:15 col:71 + %3 = getelementptr inbounds %"class.RayQuery<513, 0>", %"class.RayQuery<513, 0>"* %rayQuery, i32 0, i32 0, !dbg !35 ; line:15 col:71 + store i32 %rayQuery1, i32* %3, !dbg !35 ; line:15 col:71 + %4 = load %struct.RaytracingAccelerationStructure, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !dbg !39 ; line:17 col:3 + %5 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32 0, %struct.RaytracingAccelerationStructure %4), !dbg !39 ; line:17 col:3 + %6 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32 14, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }, %struct.RaytracingAccelerationStructure undef), !dbg !39 ; line:17 col:3 + + ; Copy RayDesc fields again + ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P1]] + ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD0_P2:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD1_P1]] + ; CHECK: store float %[[LOAD]], float* %[[RD1_P2:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P1]] + ; CHECK: store <3 x float> %[[LOAD]], <3 x float>* %[[RD2_P2:[^ ,]+]] + ; CHECK: %[[LOAD:[^ ,]+]] = load float, float* %[[RD3_P1]] + ; CHECK: store float %[[LOAD]], float* %[[RD3_P2:[^ ,]+]] + + ; Load RayDesc fields for TraceRayInline + ; CHECK: %[[RD0:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD0_P2]] + ; CHECK: %[[RD1:[^ ,]+]] = load float, float* %[[RD1_P2]] + ; CHECK: %[[RD2:[^ ,]+]] = load <3 x float>, <3 x float>* %[[RD2_P2]] + ; CHECK: %[[RD3:[^ ,]+]] = load float, float* %[[RD3_P2]] + + ; Load RayQuery + ; CHECK: %[[RQ:[^ ,]+]] = load i32, i32* %[[RQ_P0]] + + ; TraceRayInline call + ; CHECK: call void @"dx.hl.op..void (i32, i32, %dx.types.Handle, i32, i32, <3 x float>, float, <3 x float>, float)"(i32 325, i32 %[[RQ]], %dx.types.Handle %[[RTAS]], i32 1, i32 2, <3 x float> %[[RD0]], float %[[RD1]], <3 x float> %[[RD2]], float %[[RD3]]) + + call void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32 325, %"class.RayQuery<513, 0>"* %rayQuery, %dx.types.Handle %6, i32 1, i32 2, %struct.RayDesc* %0), !dbg !39 ; line:17 col:3 + ret float 0.000000e+00, !dbg !40 ; line:18 col:3 +} + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %\22class.RayQuery<513, 0>\22*, %dx.types.Handle, i32, i32, %struct.RayDesc*)"(i32, %"class.RayQuery<513, 0>"*, %dx.types.Handle, i32, i32, %struct.RayDesc*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RaytracingAccelerationStructure)"(i32, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, 
%dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RaytracingAccelerationStructure) #1 + +; Function Attrs: nounwind +declare i32 @"dx.hl.op..i32 (i32, i32, i32)"(i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!llvm.ident = !{!2} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6, !18} +!dx.entryPoints = !{!25} +!dx.fnprops = !{!32} +!dx.options = !{!33, !34} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{!"dxc(private) 1.8.0.14861 (main, 33bc44a3d)"} +!3 = !{i32 1, i32 5} +!4 = !{i32 1, i32 9} +!5 = !{!"vs", i32 6, i32 5} +!6 = !{i32 0, %struct.RayDesc undef, !7, %"class.RayQuery<513, 0>" undef, !12} +!7 = !{i32 32, !8, !9, !10, !11} +!8 = !{i32 6, !"Origin", i32 3, i32 0, i32 7, i32 9} +!9 = !{i32 6, !"TMin", i32 3, i32 12, i32 7, i32 9} +!10 = !{i32 6, !"Direction", i32 3, i32 16, i32 7, i32 9} +!11 = !{i32 6, !"TMax", i32 3, i32 28, i32 7, i32 9} +!12 = !{i32 4, !13, !14} +!13 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5} +!14 = !{i32 0, !15} +!15 = !{!16, !17} +!16 = !{i32 1, i64 513} +!17 = !{i32 1, i64 0} +!18 = !{i32 1, float (%struct.RayDesc*)* @main, !19} +!19 = !{!20, !23} +!20 = !{i32 1, !21, !22} +!21 = !{i32 4, !"OUT", i32 7, i32 9} +!22 = !{} +!23 = !{i32 0, !24, !22} +!24 = !{i32 4, !"RAYDESC"} +!25 = !{float (%struct.RayDesc*)* @main, !"main", null, !26, null} +!26 = !{!27, null, !30, null} +!27 = !{!28} +!28 = !{i32 0, %struct.RaytracingAccelerationStructure* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !29} +!29 = !{i32 0, i32 4} +!30 = !{!31} +!31 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null} +!32 = !{float (%struct.RayDesc*)* @main, i32 1} +!33 = !{i32 64} +!34 = !{i32 -1} +!35 = !DILocation(line: 15, column: 71, scope: !36) +!36 = !DISubprogram(name: "main", scope: !37, file: !37, line: 14, type: !38, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: false, function: float (%struct.RayDesc*)* @main) +!37 = !DIFile(filename: "tools/clang/test/CodeGenDXIL/hlsl/objects/RayQuery/tracerayinline.hlsl", directory: "") +!38 = !DISubroutineType(types: !22) +!39 = !DILocation(line: 17, column: 3, scope: !36) +!40 = !DILocation(line: 18, column: 3, scope: !36) diff --git a/tools/clang/test/DXC/deprecated-select-validator.hlsl b/tools/clang/test/DXC/deprecated-select-validator.hlsl new file mode 100644 index 0000000000..2ad3e5199c --- /dev/null +++ b/tools/clang/test/DXC/deprecated-select-validator.hlsl @@ -0,0 +1,14 @@ +// Test that the deprecated option, select-validator, doesn't work. 
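+// The option is no longer recognized at all, so the failure surfaces through dxc's
+// generic unknown-argument diagnostic, which is what the CHECK below matches.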
+// RUN: not %dxc -E main -T vs_6_7 -select-validator internal %s 2>&1 | FileCheck %s + +// CHECK: dxc failed : Unknown argument: '-select-validator' + +float4 main(int loc : SV_StartVertexLocation + , uint loc2 : SV_StartInstanceLocation + ) : SV_Position +{ + float4 r = 0; + r += loc; + r += loc2; + return r; +} diff --git a/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl new file mode 100644 index 0000000000..53c87bb9c1 --- /dev/null +++ b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl @@ -0,0 +1,34 @@ +// RUN: %dxc /enable-16bit-types /T cs_6_8 %s | FileCheck %s + +// Compiling this HLSL would fail this assertion in TranslateDot4AddPacked: +// +// DXASSERT( +// !accTy->isVectorTy() && accTy->isIntegerTy(32), +// "otherwise, unexpected vector support in high level intrinsic template"); +// +// Bug was fixed by changing the declarations of dot4add_i8packed and +// dot4add_u8packed in utils/hct/gen_intrin_main.txt to simply write +// out their argument and return types, rather than using the $typeN +// reference syntax. + +// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddI8Packed(acc,a,b) +// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddU8Packed(acc,a,b) +// CHECK: call float @dx.op.dot2AddHalf.f32{{.*}}Dot2AddHalf(acc,ax,ay,bx,by) + +RWByteAddressBuffer buf; + +[numthreads(1, 1, 1)] +void main() +{ + int a = dot4add_i8packed(0, 0, 0); + int b = dot4add_i8packed(0, 0, a); + buf.Store(0, b); + + uint c = dot4add_u8packed(0, 0, 0); + uint d = dot4add_u8packed(0, 0, c); + buf.Store(4, d); + + float e = dot2add(half2(0,0), half2(0,0), 1.0); + float f = dot2add(half2(0,0), half2(0,0), e); + buf.Store(8, f); +} diff --git a/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl new file mode 100644 index 0000000000..98db6a6f56 --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/hlsl/diagnostics/errors/integer_literal_too_large.hlsl @@ -0,0 +1,14 @@ +// RUN: %dxc -T lib_6_6 %s | FileCheck %s + +// A diagnostic is generated for an integer literal that is too large to be +// represented by any integer type - an argument indicates whether the text +// contains "signed". That argument was missing in HLSL specific code within +// Sema::ActOnNumericConstant() which resulted in an assert being raised if +// the diagnostic was generated in an assert enabled DXC and a random string +// being inserted in a non-assert enabled DXC. 
+ +// CHECK: integer literal is too large to be represented in any integer type +int a = 98765432109876543210; + +// CHECK: integer literal is too large to be represented in any integer type +uint b = 98765432109876543210U; diff --git a/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl index 5a1b5e43d8..4ffb325c8b 100644 --- a/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl +++ b/tools/clang/test/HLSLFileCheck/hlsl/types/struct/struct-annotations.hlsl @@ -1,5 +1,5 @@ -// RUN: %dxc -T ps_6_8 -E main -Qkeep_reflect_in_dxil -select-validator internal %s | FileCheck -check-prefix=CHECK68 %s -// RUN: %dxc -T ps_6_7 -E main -Qkeep_reflect_in_dxil -select-validator internal %s | FileCheck -check-prefix=CHECK67 %s +// RUN: %dxc -T ps_6_8 -E main -Qkeep_reflect_in_dxil %s | FileCheck -check-prefix=CHECK68 %s +// RUN: %dxc -T ps_6_7 -E main -Qkeep_reflect_in_dxil %s | FileCheck -check-prefix=CHECK67 %s // Make sure the vector is annotated with vector size (DXIL 1.8 and higher), // matrix is annotated with matrix size and orientation, and scalar does not @@ -47,4 +47,4 @@ StructuredBuffer g_myStruct; float main() : SV_Target { return g_myStruct[0].vec.x + g_myStruct[0].vec.y; -} \ No newline at end of file +} diff --git a/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl new file mode 100644 index 0000000000..1da45dae1d --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/hlsl/workgraph/nested_sv_dispatchgrid.hlsl @@ -0,0 +1,130 @@ +// RUN: %dxc -T lib_6_8 %s | FileCheck %s + +// Check that the SV_DispatchGrid DXIL metadata for a node input record is +// generated in cases where: +// node1 - the field with the SV_DispatchGrid semantic is in a nested record +// node2 - the field with the SV_DispatchGrid semantic is in a record field +// node3 - the field with the SV_DispatchGrid semantic is inherited from a base record +// node4 - the field with the SV_DispatchGrid semantic is within a nested record inherited from a base record +// node5 - the field with the SV_DispatchGrid semantic is within a base record of a nested record +// node6 - the field with the SV_DispatchGrid semantic is within a templated base record +// node7 - the field with the SV_DispatchGrid semantic is within a templated base record of a templated record +// node8 - the field with the SV_DispatchGrid semantic has a templated type + +struct Record1 { + struct { + // SV_DispatchGrid is within a nested record + uint3 grid : SV_DispatchGrid; + }; +}; + +[Shader("node")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node1(DispatchNodeInputRecord<Record1> input) {} +// CHECK: {!"node1" +// CHECK: , i32 1, ![[SVDG_1:[0-9]+]] +// CHECK: [[SVDG_1]] = !{i32 0, i32 5, i32 3} + +struct Record2a { + uint u; + uint2 grid : SV_DispatchGrid; +}; + +struct Record2 { + uint a; + // SV_DispatchGrid is within a record field + Record2a b; +}; + +[Shader("node")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node2(DispatchNodeInputRecord<Record2> input) {} +// CHECK: {!"node2" +// CHECK: , i32 1, ![[SVDG_2:[0-9]+]] +// CHECK: [[SVDG_2]] = !{i32 8, i32 5, i32 2} + +struct Record3 : Record2a { + // SV_DispatchGrid is inherited + uint4 n; +}; + +[Shader("node")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node3(DispatchNodeInputRecord<Record3> input) {} +// CHECK: {!"node3" +// CHECK: , i32 1, ![[SVDG_3:[0-9]+]] +// CHECK: 
[[SVDG_3]] = !{i32 4, i32 5, i32 2} + +struct Record4 : Record2 { + // SV_DispatchGrid is in a nested field in a base record + float f; +}; + +[Shader("node")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node4(DispatchNodeInputRecord<Record4> input) {} +// CHECK: {!"node4" +// CHECK: , i32 1, ![[SVDG_2]] + +struct Record5 { + uint4 x; + // SV_DispatchGrid is in a base record of a record field + Record3 r; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node5(DispatchNodeInputRecord<Record5> input) {} +// CHECK: {!"node5" +// CHECK: , i32 1, ![[SVDG_5:[0-9]+]] +// CHECK: [[SVDG_5]] = !{i32 20, i32 5, i32 2} + +template<typename T> +struct Base { + T DG : SV_DispatchGrid; +}; + +struct Derived1 : Base<uint3> { + int4 x; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node6(DispatchNodeInputRecord<Derived1> input) {} +// CHECK: {!"node6" +// CHECK: , i32 1, ![[SVDG_1]] + +template<typename T> +struct Derived2 : Base<T> { + T Y; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node7(DispatchNodeInputRecord<Derived2<uint2> > input) {} +// CHECK: {!"node7" +// CHECK: , i32 1, ![[SVDG_7:[0-9]+]] +// CHECK: [[SVDG_7]] = !{i32 0, i32 5, i32 2} + +template<typename T> +struct Derived3 { + Derived2<T> V; +}; + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NodeMaxDispatchGrid(32,16,1)] +[NumThreads(32,1,1)] +void node8(DispatchNodeInputRecord< Derived3<uint3> > input) {} +// CHECK: {!"node8" +// CHECK: , i32 1, ![[SVDG_1]] diff --git a/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl b/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl index 166fa5918d..14ee7f7bf9 100644 --- a/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl +++ b/tools/clang/test/HLSLFileCheck/infra/auto-dxilver.hlsl @@ -5,14 +5,17 @@ // This should implicitly require dxilver 1.8. // RUN: %dxc -T vs_6_8 -Vd %s | FileCheck %s -// Even though this is using -Vd, the validator version is set by the available -// validator. If that isn't version 1.8 or above, we'll see an error. +// Even though this is using -Vd, the validator version being checked is the internal +// validator's version. If a pre-DXIL-1.8 DXC was used to run this test, we expect failure, +// since the internal validator will be the same version as the older DXC. // The implicit dxilver logic should not skip the check when -Vd is used. // CHECK-NOT: error: validator version {{.*}} does not support target profile. // RUN: %dxc -T vs_6_0 -validator-version 1.8 %s | FileCheck %s // Even though target is 6.0, the explicit -validator-version should add an -// implicit dxilver 1.8 requirement. +// implicit dxilver 1.8 requirement. The requirement should pass for DXCs that +// support DXIL 1.8 or newer, since the internal validator's version will then +// be sufficiently new for this check. // CHECK-NOT: error: The module cannot be validated by the version of the validator currently attached. // This error would occur if run against wrong compiler. @@ -21,8 +24,6 @@ // Catch any other unexpected error cases. // CHECK-NOT: error -// RUN: %dxc -T vs_6_8 -select-validator internal %s | FileCheck %s -// This should always be run, and always succeed. 
// CHECK: define void @main() void main() {} diff --git a/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl b/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl deleted file mode 100644 index b9670bdaba..0000000000 --- a/tools/clang/test/HLSLFileCheck/pix/AnnotateVirtualRegs-Raygen.hlsl +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %dxc -Od -T lib_6_6 %s | %opt -S -dxil-annotate-with-virtual-regs | FileCheck %s - - -/* To run locally run: -%dxc -Od -T lib_6_6 %s -Fc %t.ll -%opt %t.ll -S -dxil-annotate-with-virtual-regs | FileCheck %s -*/ - -RaytracingAccelerationStructure scene : register(t0); - -struct RayPayload -{ - int3 color; -}; - -[shader("raygeneration")] -void ENTRY() -{ - RayDesc ray = {{0,0,0}, {0,0,1}, 0.05, 1000.0}; - RayPayload pld; - TraceRay(scene, 0 /*rayFlags*/, 0xFF /*rayMask*/, 0 /*sbtRecordOffset*/, 1 /*sbtRecordStride*/, 0 /*missIndex*/, ray, pld); -} - -// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 0, !pix-dxil-reg [[RDGEP:![0-9]+]], !pix-dxil-inst-num {{.*}} -// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP:![0-9]+]], !pix-dxil-inst-num {{.*}} -// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 1, !pix-dxil-reg [[RDGEP2:![0-9]+]], !pix-dxil-inst-num {{.*}} -// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP2:![0-9]+]], !pix-dxil-inst-num {{.*}} -// CHECK: {{.*}} = getelementptr inbounds %struct.RayDesc, %struct.RayDesc* {{.*}}, i32 0, i32 2, !pix-dxil-reg [[RDGEP3:![0-9]+]], !pix-dxil-inst-num {{.*}} -// CHECK: {{.*}} = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @dx.nothing.a, i32 0, i32 0), !pix-dxil-reg [[NothGEP3:![0-9]+]], !pix-dxil-inst-num {{.*}} - -// CHECK-DAG: [[RDGEP]] = !{i32 0, i32 0} -// CHECK-DAG: [[NothGEP]] = !{i32 0, i32 11} -// CHECK-DAG: [[RDGEP2]] = !{i32 0, i32 3} -// CHECK-DAG: [[NothGEP2]] = !{i32 0, i32 12} -// CHECK-DAG: [[RDGEP3]] = !{i32 0, i32 4} -// CHECK-DAG: [[NothGEP3]] = !{i32 0, i32 13} diff --git a/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl b/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl new file mode 100644 index 0000000000..cba891424a --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/pix/DbgValueToDbgDeclare_dynamic_array_index.hlsl @@ -0,0 +1,27 @@ +// RUN: %dxc -Tcs_6_0 /Od %s | %opt -S -dxil-annotate-with-virtual-regs | %FileCheck %s + +// Check that there is an alloca backing the local array +// CHECK: [[ARRAYNAME:%.*]] = alloca [4 x float] + +// Grab the GEP for the above array's element that we're expecting to store to: +// CHECK: [[ARRAYELEMENTPTR:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[ARRAYNAME]] + +// Check that the store to the alloca is annotated with pix-alloca-reg-write metadata +// (meaning that the pass accurately noted that the 8.0 is stored to a dynamic array index) +// CHECK: store float 8.000000e+00, float* [[ARRAYELEMENTPTR]] +// CHECK-SAME: !pix-alloca-reg-write + + +RWByteAddressBuffer RawUAV: register(u1); + +[numthreads(1, 1, 1)] +void main() +{ + float local_array[4]; + local_array[RawUAV.Load(0)] = 8; + local_array[RawUAV.Load(1)] = 128; + + RawUAV.Store(64+0,local_array[0]); + RawUAV.Store(64+4,local_array[1]); +} + diff --git a/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl 
b/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl new file mode 100644 index 0000000000..9ab5bce95a --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/pix/Debug_dynamic_array_index.hlsl @@ -0,0 +1,19 @@ +// RUN: %dxc -Tcs_6_0 /Od %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,UAVSize=128,upstreamSVPositionRow=2 -hlsl-dxilemit | %FileCheck %s + +// Check that there is a block precis that correctly returns that the array is a 4-value float array +// CHECK: Block#0 +// CHECK-SAME: d,0-4 + +RWByteAddressBuffer RawUAV: register(u1); + +[numthreads(1, 1, 1)] +void main() +{ + float local_array[4]; + local_array[RawUAV.Load(0)] = 8; + local_array[RawUAV.Load(1)] = 128; + + RawUAV.Store(64+0,local_array[0]); + RawUAV.Store(64+4,local_array[1]); +} + diff --git a/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl b/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl index 12df1ecbcf..98997a52b1 100644 --- a/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl +++ b/tools/clang/test/HLSLFileCheck/shader_targets/raytracing/raytracing_intersection_geometryIndex.hlsl @@ -1,10 +1,10 @@ // RUN: %dxc -T lib_6_5 -auto-binding-space 11 %s | FileCheck %s // CHECK: define void [[intersection1:@"\\01\?intersection1@[^\"]+"]]() #0 { -// CHECK: [[rayTCurrent:%[^ ]+]] = call float @dx.op.rayTCurrent.f32(i32 154) -// CHECK: [[GeometryIndex:%[^ ]+]] = call i32 @dx.op.geometryIndex.i32(i32 213) -// CHECK: icmp eq i32 [[GeometryIndex]], 0 -// CHECK: call i1 @dx.op.reportHit.struct.MyAttributes(i32 158, float [[rayTCurrent]], i32 0, %struct.MyAttributes* nonnull {{.*}}) +// CHECK-DAG: [[rayTCurrent:%[^ ]+]] = call float @dx.op.rayTCurrent.f32(i32 154) +// CHECK-DAG: [[GeometryIndex:%[^ ]+]] = call i32 @dx.op.geometryIndex.i32(i32 213) +// CHECK-DAG: icmp eq i32 [[GeometryIndex]], 0 +// CHECK-DAG: call i1 @dx.op.reportHit.struct.MyAttributes(i32 158, float [[rayTCurrent]], i32 0, %struct.MyAttributes* nonnull {{.*}}) // CHECK: ret void struct MyAttributes { diff --git a/tools/clang/test/LitDXILValidation/load-store-validation.ll b/tools/clang/test/LitDXILValidation/load-store-validation.ll index 34b2f6b602..16c64672bd 100644 --- a/tools/clang/test/LitDXILValidation/load-store-validation.ll +++ b/tools/clang/test/LitDXILValidation/load-store-validation.ll @@ -1,3 +1,4 @@ +; REQUIRES: dxil-1-9 ; RUN: not %dxv %s 2>&1 | FileCheck %s ; Ensure proper validation errors are produced for invalid parameters to load and store operations. 
diff --git a/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll new file mode 100644 index 0000000000..33591126e5 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-failing.ll @@ -0,0 +1,86 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +; Original Source: \tools\clang\test\CodeGenHLSL\linalg\outer-product-accumulate-matrix-layout.hlsl +; The failing cases were generated by manually editing the IR produced for the passing +; case by compiling the HLSL above (Original Source). + +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.ResBind = type { i32, i32, i32, i8 } +%dx.types.ResourceProperties = type { i32, i32 } +%dx.types.ResRet.v8f16 = type { <8 x half>, i32 } +%struct.ByteAddressBuffer = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } + +; As noted in other tests, the validation errors come out in +; an order different from the IR, so they are listed here in the +; order they appear, with comments added to the calls below for correlation. + +;CHECK: error: matrix stride must be a constant zero for optimal layouts +;CHECK: error: matrix stride must be a constant zero for optimal layouts +;CHECK-NOT: error: matrix layout value 'OuterProductOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' +;CHECK: error: matrix layout value 'MulOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' +;CHECK: error: matrix layout value 'ColumnMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' +;CHECK: error: matrix layout value 'RowMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' +; CHECK: Validation failed. 
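+
+; For reference, derived from the calls and the expected diagnostics below: the
+; matrixLayout operand values exercised are 0 = RowMajor, 1 = ColumnMajor,
+; 2 = MulOptimal, and 3 = OuterProductOptimal, the only layout accepted for this op.
+; For the optimal layouts the matrixStride operand must also be the constant 0, so the
+; two layout-3 calls with strides 64 and 63 trigger the stride error instead.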
+ +define void @main() { + %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %2 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %3 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind zeroinitializer, i32 0, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 11, i32 0 }) ; AnnotateHandle(res,props) resource: ByteAddressBuffer + %5 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %4, i32 0, i32 undef, i32 2) ; RawBufferVectorLoad(buf,index,elementOffset,alignment) + %6 = extractvalue %dx.types.ResRet.v8f16 %5, 0 + %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 11, i32 0 }) ; AnnotateHandle(res,props) resource: ByteAddressBuffer + %8 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %7, i32 0, i32 undef, i32 2) ; RawBufferVectorLoad(buf,index,elementOffset,alignment) + %9 = extractvalue %dx.types.ResRet.v8f16 %8, 0 + %10 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) ; AnnotateHandle(res,props) resource: RWByteAddressBuffer + ; error: matrix layout value 'RowMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 0, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ; error: matrix layout value 'ColumnMajor' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 1, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ; matrix layout value 'MulOptimal' is not valid for outerproductaccumulate, must be 'OuterProductOptimal' + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 2, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ; error: matrix stride must be a constant zero for optimal layouts + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ; error: matrix stride must be a constant zero for optimal layouts + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 63) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ret void +} + +; Function Attrs: nounwind readonly +declare %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32, %dx.types.Handle, i32, 
i32, i32) #0 + +; Function Attrs: nounwind +declare void @dx.op.outerProductAccumulate.v8f16.v8f16(i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #2 + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind } +attributes #2 = { nounwind readnone } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.entryPoints = !{!8} + +!0 = !{i32 1, i32 9} +!1 = !{!"cs", i32 6, i32 9} +!2 = !{!3, !6, null, null} +!3 = !{!4, !5} +!4 = !{i32 0, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i32 0, null} +!5 = !{i32 1, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 1, i32 1, i32 11, i32 0, null} +!6 = !{!7} +!7 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!8 = !{void ()* @main, !"main", null, !2, !9} +!9 = !{i32 0, i64 8598323216, i32 4, !10} +!10 = !{i32 1, i32 1, i32 1} diff --git a/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll new file mode 100644 index 0000000000..44cd3e48b3 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/outer-product-accumulate-matrix-layout-passing.ll @@ -0,0 +1,65 @@ +; REQUIRES: dxil-1-9 +; RUN: %dxv %s 2>&1 | FileCheck %s + +;Original Source: \tools\clang\test\CodeGenHLSL\linalg\outer-product-accumulate-matrix-layout.hlsl + +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.ResBind = type { i32, i32, i32, i8 } +%dx.types.ResourceProperties = type { i32, i32 } +%dx.types.ResRet.v8f16 = type { <8 x half>, i32 } +%struct.ByteAddressBuffer = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } + +;CHECK: Validation succeeded. 
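+
+; The single call below uses matrixLayout 3 (OuterProductOptimal) with matrixStride 0,
+; the one layout/stride combination that the failing variant of this test shows the
+; validator accepts for OuterProductAccumulate.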
+ +define void @main() { + %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %2 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %3 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind zeroinitializer, i32 0, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex) + %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 11, i32 0 }) ; AnnotateHandle(res,props) resource: ByteAddressBuffer + %5 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %4, i32 0, i32 undef, i32 2) ; RawBufferVectorLoad(buf,index,elementOffset,alignment) + %6 = extractvalue %dx.types.ResRet.v8f16 %5, 0 + %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 11, i32 0 }) ; AnnotateHandle(res,props) resource: ByteAddressBuffer + %8 = call %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32 303, %dx.types.Handle %7, i32 0, i32 undef, i32 2) ; RawBufferVectorLoad(buf,index,elementOffset,alignment) + %9 = extractvalue %dx.types.ResRet.v8f16 %8, 0 + %10 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) ; AnnotateHandle(res,props) resource: RWByteAddressBuffer + call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %6, <8 x half> %9, %dx.types.Handle %10, i32 0, i32 8, i32 3, i32 0) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) + ret void +} + +; Function Attrs: nounwind readonly +declare %dx.types.ResRet.v8f16 @dx.op.rawBufferVectorLoad.v8f16(i32, %dx.types.Handle, i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @dx.op.outerProductAccumulate.v8f16.v8f16(i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #2 + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind } +attributes #2 = { nounwind readnone } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.entryPoints = !{!8} + +!0 = !{i32 1, i32 9} +!1 = !{!"cs", i32 6, i32 9} +!2 = !{!3, !6, null, null} +!3 = !{!4, !5} +!4 = !{i32 0, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i32 0, null} +!5 = !{i32 1, %struct.ByteAddressBuffer* undef, !"", i32 0, i32 1, i32 1, i32 11, i32 0, null} +!6 = !{!7} +!7 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!8 = !{void ()* @main, !"main", null, !2, !9} +!9 = !{i32 0, i64 8598323216, i32 4, !10} +!10 = !{i32 1, i32 1, i32 1} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll new file mode 100644 index 0000000000..7270996b91 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_failing.ll @@ -0,0 +1,202 @@ +; REQUIRES: 
dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.HitObject = type { i8* } + +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK: note: at '%r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 1)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r271 = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r270 = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r269 = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r286 = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r285 = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r284 = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r283 = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r282 = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r281 = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r272 = call i32 @dx.op.hitObject_StateScalar.i32(i32 272, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7)' in block '#0' of function '?main@@YAXXZ'. 
+; CHECK: Function: ?main@@YAXXZ: error: parameter 'offset' must be a multiple of 4, got 7 +; CHECK: note: at '%r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: parameter 'offset' must be a multiple of 4, got 42 +; CHECK: note: at '%r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3. +; CHECK: note: at '%r278_oobc = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_ObjectRayDirection must be an immediate constant. +; CHECK: note: at '%r278_dync = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r278 = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3. +; CHECK: note: at '%r277_oobc = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_ObjectRayOrigin must be an immediate constant. +; CHECK: note: at '%r277_dync = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r277 = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3. +; CHECK: note: at '%r276_oobc = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_WorldRayDirection must be an immediate constant. +; CHECK: note: at '%r276_dync = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r276 = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect component between 0~2, got 3. +; CHECK: note: at '%r275_oobc = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 3)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: component of HitObject_WorldRayOrigin must be an immediate constant. +; CHECK: note: at '%r275_dync = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. 
+; CHECK: note: at '%r275 = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject undef, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r274 = call float @dx.op.hitObject_StateScalar.f32(i32 274, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r273 = call float @dx.op.hitObject_StateScalar.f32(i32 273, %dx.types.HitObject undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect column between 0~3, got 4. +; CHECK: note: at '%r280_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 4)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: column of HitObject_WorldToObject3x4 must be an immediate constant. +; CHECK: note: at '%r280_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect row between 0~2, got 3. +; CHECK: note: at '%r280_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 3, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: row of HitObject_WorldToObject3x4 must be an immediate constant. +; CHECK: note: at '%r280_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 %r272, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r280 = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject undef, i32 0, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect column between 0~3, got 4. +; CHECK: note: at '%r279_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 4)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: column of HitObject_ObjectToWorld3x4 must be an immediate constant. +; CHECK: note: at '%r279_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 %r272)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: expect row between 0~2, got 3. +; CHECK: note: at '%r279_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 3, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: row of HitObject_ObjectToWorld3x4 must be an immediate constant. +; CHECK: note: at '%r279_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 %r272, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK: note: at '%r279 = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject undef, i32 0, i32 0)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { +%nop = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() + %r269 = call i1 @dx.op.hitObject_StateScalar.i1(i32 269, %dx.types.HitObject undef) ; HitObject_IsMiss(hitObject) + + %r270 = call i1 @dx.op.hitObject_StateScalar.i1(i32 270, %dx.types.HitObject undef) ; HitObject_IsHit(hitObject) + + %r271 = call i1 @dx.op.hitObject_StateScalar.i1(i32 271, %dx.types.HitObject undef) ; HitObject_IsNop(hitObject) + + %r272 = call i32 @dx.op.hitObject_StateScalar.i32(i32 272, %dx.types.HitObject undef) ; HitObject_RayFlags(hitObject) + + %r273 = call float @dx.op.hitObject_StateScalar.f32(i32 273, %dx.types.HitObject undef) ; HitObject_RayTMin(hitObject) + + %r274 = call float @dx.op.hitObject_StateScalar.f32(i32 274, %dx.types.HitObject undef) ; HitObject_RayTCurrent(hitObject) + + %r275 = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject undef, i32 0) ; HitObject_WorldRayOrigin(hitObject,component) + %r275_dync = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 %r272) ; HitObject_WorldRayOrigin(hitObject,component) + %r275_oobc = call float @dx.op.hitObject_StateVector.f32(i32 275, %dx.types.HitObject %nop, i32 3) ; HitObject_WorldRayOrigin(hitObject,component) + + %r276 = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject undef, i32 0) ; HitObject_WorldRayDirection(hitObject,component) + %r276_dync = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 %r272) ; HitObject_WorldRayDirection(hitObject,component) + %r276_oobc = call float @dx.op.hitObject_StateVector.f32(i32 276, %dx.types.HitObject %nop, i32 3) ; HitObject_WorldRayDirection(hitObject,component) + + %r277 = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject undef, i32 0) ; HitObject_ObjectRayOrigin(hitObject,component) + %r277_dync = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 %r272) ; HitObject_ObjectRayOrigin(hitObject,component) + %r277_oobc = call float @dx.op.hitObject_StateVector.f32(i32 277, %dx.types.HitObject %nop, i32 3) ; HitObject_ObjectRayOrigin(hitObject,component) + + %r278 = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject undef, i32 0) ; HitObject_ObjectRayDirection(hitObject,component) + %r278_dync = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 %r272) ; HitObject_ObjectRayDirection(hitObject,component) + %r278_oobc = call float @dx.op.hitObject_StateVector.f32(i32 278, %dx.types.HitObject %nop, i32 3) ; HitObject_ObjectRayDirection(hitObject,component) + + %r279 = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject undef, i32 0, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) + %r279_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 %r272, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) + %r279_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 3, i32 0) ; HitObject_ObjectToWorld3x4(hitObject,row,col) + %r279_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 %r272) ; HitObject_ObjectToWorld3x4(hitObject,row,col) + %r279_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 279, %dx.types.HitObject %nop, i32 0, i32 4) ; HitObject_ObjectToWorld3x4(hitObject,row,col) + + %r280 = call float @dx.op.hitObject_StateMatrix.f32(i32 280, 
%dx.types.HitObject undef, i32 0, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) + %r280_dynr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 %r272, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) + %r280_oobr = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 3, i32 0) ; HitObject_WorldToObject3x4(hitObject,row,col) + %r280_dync = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 %r272) ; HitObject_WorldToObject3x4(hitObject,row,col) + %r280_oobc = call float @dx.op.hitObject_StateMatrix.f32(i32 280, %dx.types.HitObject %nop, i32 0, i32 4) ; HitObject_WorldToObject3x4(hitObject,row,col) + + %r281 = call i32 @dx.op.hitObject_StateScalar.i32(i32 281, %dx.types.HitObject undef) ; HitObject_GeometryIndex(hitObject) + + %r282 = call i32 @dx.op.hitObject_StateScalar.i32(i32 282, %dx.types.HitObject undef) ; HitObject_InstanceIndex(hitObject) + + %r283 = call i32 @dx.op.hitObject_StateScalar.i32(i32 283, %dx.types.HitObject undef) ; HitObject_InstanceID(hitObject) + + %r284 = call i32 @dx.op.hitObject_StateScalar.i32(i32 284, %dx.types.HitObject undef) ; HitObject_PrimitiveIndex(hitObject) + + %r285 = call i32 @dx.op.hitObject_StateScalar.i32(i32 285, %dx.types.HitObject undef) ; HitObject_HitKind(hitObject) + + %r286 = call i32 @dx.op.hitObject_StateScalar.i32(i32 286, %dx.types.HitObject undef) ; HitObject_ShaderTableIndex(hitObject) + + %r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 1) ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex) + %r287_ud = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject undef, i32 undef) ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex) + + %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 42) ; HitObject_LoadLocalRootTableConstant(hitObject,offset) + %r288_wrongmul = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject undef, i32 7) ; HitObject_LoadLocalRootTableConstant(hitObject,offset) + + ret void +} + +; Function Attrs: nounwind readnone +declare %dx.types.HitObject @dx.op.hitObject_MakeNop(i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32, %dx.types.HitObject, i32) #1 + +; Function Attrs: nounwind readnone +declare i1 @dx.op.hitObject_StateScalar.i1(i32, %dx.types.HitObject) #1 + +; Function Attrs: nounwind readnone +declare i32 @dx.op.hitObject_StateScalar.i32(i32, %dx.types.HitObject) #1 + +; Function Attrs: nounwind readonly +declare i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32, %dx.types.HitObject, i32) #2 + +; Function Attrs: nounwind readnone +declare float @dx.op.hitObject_StateVector.f32(i32, %dx.types.HitObject, i32) #1 + +; Function Attrs: nounwind readnone +declare float @dx.op.hitObject_StateScalar.f32(i32, %dx.types.HitObject) #1 + +; Function Attrs: nounwind readnone +declare float @dx.op.hitObject_StateMatrix.f32(i32, %dx.types.HitObject, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind argmemonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.typeAnnotations = !{!2} +!dx.entryPoints = !{!3, !4} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !5} +!3 = !{null, !"", null, 
null, !6} +!4 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !7} +!5 = !{!8} +!6 = !{i32 0, i64 0} +!7 = !{i32 8, i32 7, i32 5, !9} +!8 = !{i32 1, !10, !10} +!9 = !{i32 0} +!10 = !{} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll index e527125009..74cc94fb78 100644 --- a/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_accessors_passing.ll @@ -52,7 +52,7 @@ define void @"\01?main@@YAXXZ"() #0 { %r287 = call %dx.types.HitObject @dx.op.hitObject_SetShaderTableIndex(i32 287, %dx.types.HitObject %nop, i32 1) ; HitObject_SetShaderTableIndex(hitObject,shaderTableIndex) - %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %nop, i32 42) ; HitObject_LoadLocalRootTableConstant(hitObject,offset) + %r288 = call i32 @dx.op.hitObject_LoadLocalRootTableConstant(i32 288, %dx.types.HitObject %nop, i32 16) ; HitObject_LoadLocalRootTableConstant(hitObject,offset) call void @dx.op.hitObject_Attributes.struct.AttribType(i32 289, %dx.types.HitObject %nop, %struct.AttribType* nonnull %attrs) ; HitObject_Attributes(hitObject,attributes) ret void diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll new file mode 100644 index 0000000000..602ff99a55 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_fromrayquery_failing.ll @@ -0,0 +1,99 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%attrsud3 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%attrsud2 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 undef, %struct.CustomAttrs* nonnull %attra)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%attrsud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 undef, i32 16, %struct.CustomAttrs* nonnull %attra)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%ud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Validation failed. 
+ +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%struct.Payload = type { <3 x float> } +%struct.CustomAttrs = type { float, float } +%dx.types.ResourceProperties = type { i32, i32 } +%dx.types.HitObject = type { i8* } +%struct.RaytracingAccelerationStructure = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %ldh = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4 + %attra = alloca %struct.CustomAttrs, align 4 + %rq = call i32 @dx.op.allocateRayQuery(i32 178, i32 5) ; AllocateRayQuery(constRayFlags) + %createh = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %ldh) ; CreateHandleForLib(Resource) + %annoth = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %createh, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure + call void @dx.op.rayQuery_TraceRayInline(i32 179, i32 %rq, %dx.types.Handle %annoth, i32 0, i32 255, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03) ; RayQuery_TraceRayInline(rayQueryHandle,accelerationStructure,rayFlags,instanceInclusionMask,origin_X,origin_Y,origin_Z,tMin,direction_X,direction_Y,direction_Z,tMax) + + %ok = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 %rq) ; HitObject_FromRayQuery(rayQueryHandle) + %ud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32 263, i32 undef) ; HitObject_FromRayQuery(rayQueryHandle) + + %attrsok = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull %attra) ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs) + %attrsud1 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 undef, i32 16, %struct.CustomAttrs* nonnull %attra) ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs) + %attrsud2 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 undef, %struct.CustomAttrs* nonnull %attra) ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs) + %attrsud3 = call %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32 264, i32 %rq, i32 16, %struct.CustomAttrs* nonnull undef) ; HitObject_FromRayQueryWithAttrs(rayQueryHandle,HitKind,CommittedAttribs) + + ret void +} + +; Function Attrs: nounwind +declare i32 @dx.op.allocateRayQuery(i32, i32) #0 + +; Function Attrs: nounwind +declare void @dx.op.rayQuery_TraceRayInline(i32, i32, %dx.types.Handle, i32, i32, float, float, float, float, float, float, float, float) #0 + +; Function Attrs: nounwind readonly +declare %dx.types.HitObject @dx.op.hitObject_FromRayQueryWithAttrs.struct.CustomAttrs(i32, i32, i32, %struct.CustomAttrs*) #1 + +; Function Attrs: nounwind readonly +declare %dx.types.HitObject @dx.op.hitObject_FromRayQuery(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare 
%dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!6} +!dx.dxrPayloadAnnotations = !{!10} +!dx.entryPoints = !{!13, !15} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{!3, null, null, null} +!3 = !{!4} +!4 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !5} +!5 = !{i32 0, i32 4} +!6 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7} +!7 = !{!8} +!8 = !{i32 1, !9, !9} +!9 = !{} +!10 = !{i32 0, %struct.Payload undef, !11} +!11 = !{!12} +!12 = !{i32 0, i32 8210} +!13 = !{null, !"", null, !2, !14} +!14 = !{i32 0, i64 33554432} +!15 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !16} +!16 = !{i32 8, i32 7, i32 5, !17} +!17 = !{i32 0} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll new file mode 100644 index 0000000000..a6bdd49f72 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_invoke_failing.ll @@ -0,0 +1,58 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.Payload = type { <3 x float> } +%dx.types.HitObject = type { i8* } + +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at 'call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: HitObject is undef. +; CHECK-NEXT: note: at 'call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject undef, %struct.Payload* nonnull %pld)' in block '#0' of function '?main@@YAXXZ'. + +; CHECK-NEXT: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %pld = alloca %struct.Payload, align 4 + %nop = call %dx.types.HitObject @dx.op.hitObject_MakeNop(i32 266) ; HitObject_MakeNop() + call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull %pld) ; HitObject_Invoke(hitObject,payload) + call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject undef, %struct.Payload* nonnull %pld) ; HitObject_Invoke(hitObject,payload) + call void @dx.op.hitObject_Invoke.struct.Payload(i32 267, %dx.types.HitObject %nop, %struct.Payload* nonnull undef) ; HitObject_Invoke(hitObject,payload) + + ret void +} + +; Function Attrs: nounwind readnone +declare %dx.types.HitObject @dx.op.hitObject_MakeNop(i32) #1 + +; Function Attrs: nounwind +declare void @dx.op.hitObject_Invoke.struct.Payload(i32, %dx.types.HitObject, %struct.Payload*) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.typeAnnotations = !{!2} +!dx.dxrPayloadAnnotations = !{!3} +!dx.entryPoints = !{!4, !6} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7} +!3 = !{i32 0, %struct.Payload undef, !8} +!4 = !{null, !"", null, null, !5} +!5 = !{i32 0, i64 0} +!6 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !9} +!7 = !{!10} +!8 = !{!11} +!9 = !{i32 8, i32 7, i32 5, !12} +!10 = !{i32 1, !13, !13} +!11 = !{i32 0, i32 8210} +!12 = !{i32 0} +!13 = !{} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll new file mode 100644 index 0000000000..b47f178ca2 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_make_failing.ll @@ -0,0 +1,44 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.HitObject = type { i8* } + +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK: note: at '%r265_udmiss = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 4, i32 undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK: note: at '%r265_udflags = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 undef, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03)' in block '#0' of function '?main@@YAXXZ'. +; CHECK: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %r265_udflags = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 undef, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03) ; HitObject_MakeMiss(RayFlags,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax) + %r265_udmiss = call %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32 265, i32 4, i32 undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 9.999000e+03) ; HitObject_MakeMiss(RayFlags,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax) + ret void +} + +; Function Attrs: nounwind readnone +declare %dx.types.HitObject @dx.op.hitObject_MakeMiss(i32, i32, i32, float, float, float, float, float, float, float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.typeAnnotations = !{!2} +!dx.entryPoints = !{!9, !11} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{i32 1, void ()* @"\01?main@@YAXXZ", !3} +!3 = !{!4} +!4 = !{i32 1, !5, !5} +!5 = !{} +!9 = !{null, !"", null, null, !10} +!10 = !{i32 0, i64 0} +!11 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !12} +!12 = !{i32 8, i32 7, i32 5, !13} +!13 = !{i32 0} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll new file mode 100644 index 0000000000..eb0d2576b0 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_failing.ll @@ -0,0 +1,114 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%struct.Payload = type { <3 x float> } +%dx.types.ResourceProperties = type { i32, i32 } +%dx.types.HitObject = type { i8* } +%struct.RaytracingAccelerationStructure = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 + +; CHECK: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud16 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* undef)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud15 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float undef, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. 
+; CHECK-NEXT: note: at '%tud14 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float undef, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud13 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float undef, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud12 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud11 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float undef, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud10 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float undef, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud9 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float undef, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud8 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. 
+; CHECK-NEXT: note: at '%tud7 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 undef, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud6 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 undef, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud5 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 undef, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud4 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 undef, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud3 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 undef, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Instructions should not read uninitialized value. +; CHECK-NEXT: note: at '%tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: TraceRay should only use RTAccelerationStructure. +; CHECK-NEXT: note: at '%tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2)' in block '#0' of function '?main@@YAXXZ'. + +; CHECK-NEXT: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4 + %2 = alloca %struct.Payload, align 4 + %3 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %4 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %3, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure + + %tok = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud2 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle undef, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud3 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 undef, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud4 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 undef, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud5 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 undef, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud6 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 
262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 undef, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud7 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 undef, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud8 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud9 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float undef, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud10 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float undef, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud11 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float undef, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud12 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, 
float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud13 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float undef, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud14 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float undef, float 7.000000e+00, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud15 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float undef, %struct.Payload* nonnull %2) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + %tud16 = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %4, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* undef) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + + ret void +} + +; Function Attrs: nounwind +declare %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.Payload*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!3} +!dx.dxrPayloadAnnotations = !{!4} +!dx.entryPoints = !{!5, 
!6} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{!7, null, null, null} +!3 = !{i32 1, void ()* @"\01?main@@YAXXZ", !8} +!4 = !{i32 0, %struct.Payload undef, !9} +!5 = !{null, !"", null, !2, null} +!6 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !10} +!7 = !{!11} +!8 = !{!12} +!9 = !{!13} +!10 = !{i32 8, i32 7, i32 5, !14} +!11 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !15} +!12 = !{i32 1, !16, !16} +!13 = !{i32 0, i32 8210} +!14 = !{i32 0} +!15 = !{i32 0, i32 4} +!16 = !{} diff --git a/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll new file mode 100644 index 0000000000..c4f3a918f8 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_hitobject_trace_invaliduav.ll @@ -0,0 +1,108 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +; shader hash: b22988e7874179601860019e56fb877e +; +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; RTAS texture i32 ras T0t4294967295,space4294967295 1 +; nonas_buf UAV byte r/w U0u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%struct.Payload = type { <3 x float> } +%dx.types.ResourceProperties = type { i32, i32 } +%dx.types.HitObject = type { i8* } +%struct.RaytracingAccelerationStructure = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } + +@"\01?RTAS@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 +@"\01?nonas_buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4 + +; CHECK: Function: ?main@@YAXXZ: error: TraceRay should only use RTAccelerationStructure. +; CHECK-NEXT: note: at '%invalid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %7, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A", align 4 + %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?nonas_buf@@3URWByteAddressBuffer@@A", align 4 + %3 = alloca %struct.Payload, align 4 + %4 = bitcast %struct.Payload* %3 to i8* + call void @llvm.lifetime.start(i64 12, i8* %4) #0 + %5 = getelementptr inbounds %struct.Payload, %struct.Payload* %3, i32 0, i32 0 + store <3 x float> , <3 x float>* %5, align 4, !tbaa !20 + %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) ; CreateHandleForLib(Resource) + %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 4107, i32 0 }) ; AnnotateHandle(res,props) resource: RWByteAddressBuffer + call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %7, i32 0, i32 undef, float 1.100000e+01, float undef, float undef, float undef, i8 1, i32 4) ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment) + %8 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %9 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %8, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure + + %valid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %9, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + + %invalid = call %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32 262, %dx.types.Handle %7, i32 513, i32 1, i32 2, i32 4, i32 0, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, %struct.Payload* nonnull %3) ; HitObject_TraceRay(accelerationStructure,rayFlags,instanceInclusionMask,rayContributionToHitGroupIndex,multiplierForGeometryContributionToHitGroupIndex,missShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) + + call void @llvm.lifetime.end(i64 12, i8* %4) #0 + ret void +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @dx.op.rawBufferStore.f32(i32, %dx.types.Handle, i32, i32, float, float, float, float, i8, i32) #0 + +; Function Attrs: nounwind +declare %dx.types.HitObject @dx.op.hitObject_TraceRay.struct.Payload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.Payload*) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind 
readnone } +attributes #2 = { nounwind readonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!8} +!dx.dxrPayloadAnnotations = !{!12} +!dx.entryPoints = !{!15, !17} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{!3, !6, null, null} +!3 = !{!4} +!4 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?RTAS@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"RTAS", i32 -1, i32 -1, i32 1, i32 16, i32 0, !5} +!5 = !{i32 0, i32 4} +!6 = !{!7} +!7 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?nonas_buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"nonas_buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!8 = !{i32 1, void ()* @"\01?main@@YAXXZ", !9} +!9 = !{!10} +!10 = !{i32 1, !11, !11} +!11 = !{} +!12 = !{i32 0, %struct.Payload undef, !13} +!13 = !{!14} +!14 = !{i32 0, i32 8210} +!15 = !{null, !"", null, !2, !16} +!16 = !{i32 0, i64 8589934608} +!17 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !18} +!18 = !{i32 8, i32 7, i32 5, !19} +!19 = !{i32 0} +!20 = !{!21, !21, i64 0} +!21 = !{!"omnipotent char", !22, i64 0} +!22 = !{!"Simple C/C++ TBAA"} diff --git a/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll new file mode 100644 index 0000000000..cd93eca793 --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm68_failing.ll @@ -0,0 +1,77 @@ +; REQUIRES: dxil-1-8 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; BAB UAV byte r/w U0 u1 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%struct.RWByteAddressBuffer = type { i32 } + +@"\01?BAB@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4 + +; CHECK: Function: ?main@@YAXXZ: error: Invalid semantic flags on DXIL operation 'BarrierByMemoryType' +; CHECK-NEXT: note: at 'call void @dx.op.barrierByMemoryType(i32 244, i32 1, i32 8)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Invalid semantic flags on DXIL operation 'barrierByMemoryHandle' +; CHECK-NEXT: note: at 'call void @dx.op.barrierByMemoryHandle(i32 245, %dx.types.Handle %3, i32 8)' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties. See other errors for details. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: Function uses features incompatible with the shader model. +; CHECK-NEXT: Validation failed. 
+ +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?BAB@@3URWByteAddressBuffer@@A", align 4 + call void @dx.op.barrierByMemoryType(i32 244, i32 1, i32 8) ; BarrierByMemoryType(MemoryTypeFlags,SemanticFlags) + %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 }) ; AnnotateHandle(res,props) resource: RWByteAddressBuffer + call void @dx.op.barrierByMemoryHandle(i32 245, %dx.types.Handle %3, i32 8) ; BarrierByMemoryHandle(object,SemanticFlags) + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @dx.op.barrierByMemoryType(i32, i32, i32) #1 + +; Function Attrs: noduplicate nounwind +declare void @dx.op.barrierByMemoryHandle(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #3 + +attributes #0 = { nounwind } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind readnone } +attributes #3 = { nounwind readonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!5} +!dx.entryPoints = !{!9, !11} + +!0 = !{i32 1, i32 8} +!1 = !{!"lib", i32 6, i32 8} +!2 = !{null, !3, null, null} +!3 = !{!4} +!4 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?BAB@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"BAB", i32 0, i32 1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!5 = !{i32 1, void ()* @"\01?main@@YAXXZ", !6} +!6 = !{!7} +!7 = !{i32 1, !8, !8} +!8 = !{} +!9 = !{null, !"", null, !2, !10} +!10 = !{i32 0, i64 8589934608} +!11 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !12} +!12 = !{i32 8, i32 7, i32 5, !13} +!13 = !{i32 0} diff --git a/tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll similarity index 96% rename from tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll rename to tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll index cab9942b02..fa2733ef22 100644 --- a/tools/clang/test/HLSLFileCheck/validation/ser_reorder_scope_sm69_passing.ll +++ b/tools/clang/test/LitDXILValidation/ser_reorder_scope_sm69_passing.ll @@ -1,4 +1,7 @@ -; RUN: %dxilver 1.9 | %dxv %s +; REQUIRES: dxil-1-9 +; RUN: %dxv %s 2>&1 | FileCheck %s + +; CHECK: Validation succeeded. 
; Buffer Definitions: ; diff --git a/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll new file mode 100644 index 0000000000..1f68a9a95f --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_incdec.ll @@ -0,0 +1,92 @@ +; REQUIRES: dxil-1-9 +; RUN: not %dxv %s 2>&1 | FileCheck %s + +; COM: Original HLSL source: +; COM: reordercoherent RWStructuredBuffer buffer; +; COM: +; COM: +; COM: [Shader("raygeneration")] +; COM: void +; COM: main() +; COM: { +; COM: buffer.IncrementCounter(); +; COM: buffer.DecrementCounter(); +; COM: } + +; CHECK: error: reordercoherent cannot be used on buffer with counter 'buffer' +; CHECK-NEXT: Validation failed. + +; shader hash: 638950814a9023bf537d61dbb330a4c8 +; +; Buffer Definitions: +; +; Resource bind info for buffer +; { +; +; float $Element; ; Offset: 0 Size: 4 +; +; } +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; buffer UAV struct r/w+cnt U0u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%"class.RWStructuredBuffer" = type { float } + +@"\01?buffer@@3V?$RWStructuredBuffer@M@@A" = external constant %dx.types.Handle, align 4 + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buffer@@3V?$RWStructuredBuffer@M@@A", align 4 + %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 102412, i32 4 }) ; AnnotateHandle(res,props) resource: reordercoherent RWStructuredBuffer + %4 = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle %3, i8 1) ; BufferUpdateCounter(uav,inc) + %5 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %6 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 102412, i32 4 }) ; AnnotateHandle(res,props) resource: reordercoherent RWStructuredBuffer + %7 = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle %6, i8 -1) ; BufferUpdateCounter(uav,inc) + ret void +} + +; Function Attrs: nounwind +declare i32 @dx.op.bufferUpdateCounter(i32, %dx.types.Handle, i8) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!dx.version = !{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!6} +!dx.entryPoints = !{!10, !12} + +!0 = !{i32 1, i32 9} +!1 = !{!"lib", i32 6, i32 9} +!2 = !{null, !3, null, null} +!3 = !{!4} +!4 = !{i32 0, %"class.RWStructuredBuffer"* bitcast (%dx.types.Handle* @"\01?buffer@@3V?$RWStructuredBuffer@M@@A" to %"class.RWStructuredBuffer"*), !"buffer", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 true, i1 
false, !5} +!5 = !{i32 1, i32 4, i32 4, i1 true} +!6 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7} +!7 = !{!8} +!8 = !{i32 1, !9, !9} +!9 = !{} +!10 = !{null, !"", null, !2, !11} +!11 = !{i32 0, i64 8589934608} +!12 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !13} +!13 = !{i32 8, i32 7, i32 5, !14} +!14 = !{i32 0} \ No newline at end of file diff --git a/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll new file mode 100644 index 0000000000..efcb7d3c2b --- /dev/null +++ b/tools/clang/test/LitDXILValidation/ser_reordercoherent_invalid_sm.ll @@ -0,0 +1,83 @@ +; REQUIRES: dxil-1-8 +; RUN: not %dxv %s 2>&1 | FileCheck %s + + +; CHECK: error: reordercoherent requires SM 6.9 or later. 'buf' +; CHECK-NEXT: Function: ?main@@YAXXZ: error: reordercoherent requires SM 6.9 or later. +; CHECK-NEXT: note: at '%3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 })' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Function: ?main@@YAXXZ: error: reordercoherent requires SM 6.9 or later. +; CHECK-NEXT: note: at '%3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 })' in block '#0' of function '?main@@YAXXZ'. +; CHECK-NEXT: Validation failed. +; COM: Original HLSL source: +; COM: reordercoherent RWByteAddressBuffer buf; +; COM: +; COM: [Shader("raygeneration")] +; COM: void main() +; COM: { +; COM: buf.Store(0, 11.f); +; COM: } + +; shader hash: f7be6354830d1423764991adcfc26b0b +; +; Buffer Definitions: +; +; +; Resource Bindings: +; +; Name Type Format Dim ID HLSL Bind Count +; ------------------------------ ---------- ------- ----------- ------- -------------- ------ +; buf UAV byte r/w U0u4294967295,space4294967295 1 +; +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } +%struct.RWByteAddressBuffer = type { i32 } + +@"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4 + +; Function Attrs: nounwind +define void @"\01?main@@YAXXZ"() #0 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4 + %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) + %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 69643, i32 0 }) ; AnnotateHandle(res,props) resource: reordercoherent RWByteAddressBuffer + call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %3, i32 0, i32 undef, float 1.100000e+01, float undef, float undef, float undef, i8 1, i32 4) ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment) + ret void +} + +; Function Attrs: nounwind +declare void @dx.op.rawBufferStore.f32(i32, %dx.types.Handle, i32, i32, float, float, float, float, i8, i32) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 + +; Function Attrs: nounwind readonly +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!dx.version = 
!{!0} +!dx.valver = !{!0} +!dx.shaderModel = !{!1} +!dx.resources = !{!2} +!dx.typeAnnotations = !{!3} +!dx.entryPoints = !{!4, !5} + +!0 = !{i32 1, i32 8} +!1 = !{!"lib", i32 6, i32 8} +!2 = !{null, !6, null, null} +!3 = !{i32 1, void ()* @"\01?main@@YAXXZ", !7} +!4 = !{null, !"", null, !2, !8} +!5 = !{void ()* @"\01?main@@YAXXZ", !"\01?main@@YAXXZ", null, null, !9} +!6 = !{!10} +!7 = !{!11} +!8 = !{i32 0, i64 8589934608} +!9 = !{i32 8, i32 7, i32 5, !12} +!10 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, !13} +!11 = !{i32 1, !14, !14} +!12 = !{i32 0} +!13 = !{i32 4, i1 true} +!14 = !{} diff --git a/tools/clang/test/LitDXILValidation/vector-validation.ll b/tools/clang/test/LitDXILValidation/vector-validation.ll index 74e8116e88..b32ac0cd5c 100644 --- a/tools/clang/test/LitDXILValidation/vector-validation.ll +++ b/tools/clang/test/LitDXILValidation/vector-validation.ll @@ -1,3 +1,4 @@ +; REQUIRES: dxil-1-9 ; RUN: not %dxv %s 2>&1 | FileCheck %s ; Confirm that 6.9 specific LLVM operations and DXIL intrinsics fail in 6.8 diff --git a/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl b/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl deleted file mode 100644 index ece7e3f2f4..0000000000 --- a/tools/clang/test/SemaHLSL/attributes/spv.inline.decorate.member.hlsl +++ /dev/null @@ -1,13 +0,0 @@ -// REQUIRES: spirv -// RUN: %dxc -T ps_6_0 -E main -verify -spirv %s - -struct S -{ - [[vk::ext_decorate_id(/*offset*/ 35, 0)]] float4 f1; /* expected-error{{'ext_decorate_id' attribute only applies to functions, variables, parameters, and types}} */ - [[vk::ext_decorate_string(/*offset*/ 35, "16")]] float4 f2; /* expected-error{{'ext_decorate_string' attribute only applies to functions, variables, parameters, and types}} */ -}; - -float4 main() : SV_TARGET -{ - -} diff --git a/tools/clang/test/SemaHLSL/effects-syntax.hlsl b/tools/clang/test/SemaHLSL/effects-syntax.hlsl index 5a7492a9da..e5468cbd41 100644 --- a/tools/clang/test/SemaHLSL/effects-syntax.hlsl +++ b/tools/clang/test/SemaHLSL/effects-syntax.hlsl @@ -108,12 +108,10 @@ static const PixelShader ps1 { state=foo; }; /* expected-warning /*verify-ast No matching AST found for line! */ -// expected-note@? {{'PixelShader' declared here}} PixelShadeR ps < int foo=1;> = ps1; // Case insensitive! /* expected-error {{unknown type name 'PixelShadeR'; did you mean 'PixelShader'?}} expected-warning {{effect object ignored - effect syntax is deprecated}} expected-warning {{possible effect annotation ignored - effect syntax is deprecated}} fxc-pass {{}} */ /*verify-ast No matching AST found for line! */ -// expected-note@? {{'VertexShader' declared here}} VertexShadeR vs; // Case insensitive! 
/* expected-error {{unknown type name 'VertexShadeR'; did you mean 'VertexShader'?}} expected-warning {{effect object ignored - effect syntax is deprecated}} fxc-pass {{}} */ // Case sensitive diff --git a/tools/clang/test/SemaHLSL/enum_sizeof.hlsl b/tools/clang/test/SemaHLSL/enum_sizeof.hlsl new file mode 100644 index 0000000000..71723976a9 --- /dev/null +++ b/tools/clang/test/SemaHLSL/enum_sizeof.hlsl @@ -0,0 +1,31 @@ +// RUN: %dxc -T cs_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST + +enum E1 : uint64_t +{ + v1 = 0, +}; + +enum E2 : uint32_t +{ + v2 = 0, +}; + +struct S { + E1 e1; + E2 e2; +}; + +RWBuffer b; + +[numthreads(128, 1, 1)] +void main() +{ +// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'E1' + b[0] = sizeof(E1); + +// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'E2' + b[1] = sizeof(E2); + +// AST: UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'S' + b[2] = sizeof(S); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl new file mode 100644 index 0000000000..866fad8225 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_invalid.hlsl @@ -0,0 +1,1398 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer output_vector_buffer; +ByteAddressBuffer constants_buffer; + +// Output vector, isUnsigned mismatch +void test_invalid_output_vector_type() { + + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector output_vector_0; + const uint is_output_unsigned_0 = 0; + + // expected-error@+1 {{IsOuputUnsigned must be true for vector of unsigned integer type}} + __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector output_vector_1; + const uint is_output_unsigned_1 = 1; + + // expected-error@+1 {{IsOuputUnsigned must be false for vector of signed integer type}} + __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned_1, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector output_vector_2; + const uint is_output_unsigned_2 = 1; + + // expected-error@+1 {{IsOuputUnsigned must be false for vector of floating point type}} + __builtin_MatVecMulAdd(output_vector_2, is_output_unsigned_2, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + 
matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// IsOutputUnsigned is not a constant parameter +void test_invalid_is_output_unsigned_non_const() { + + vector output_vector_0; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint is_output_unsigned_0 = constants_buffer.Load(0); + + // expected-error@+1 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Input vector is incorrect type - 64 bit types +void test_invalid_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 0; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMulAdd'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, 
matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Input vector is incorrect type for packed InputInterpretation +void test_invalid_input_vector_type_packed_input_interpretation() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 1; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_2 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 1; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation_2, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_3 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_3 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_3 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_3, + is_input_unsigned_3, input_interpretation_3, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_4 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_4 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_4 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_4, + is_input_unsigned_4, input_interpretation_4, matrix_buffer, + 
matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// IsInputUnsigned must be true for packed input vector type +void test_invalid_is_input_unsigned_packed_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check packed input vector dimension +void test_invalid_packed_input_vector_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_UINT8; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const bool matrix_is_transposed = false; + const uint matrix_stride = 0; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_UINT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_0 = 4; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_1 = 7; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + 
__builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_2 = 7; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_2, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + +} + +// Check is Input vector type/isInputUnsigned matched +void test_invalid_input_vector_type_mismatch() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for vector of unsigned integer type}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-error@+2 {{IsInputUnsigned must be false for vector of signed integer type}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 1; + + // expected-error@+2 {{IsInputUnsigned must be false for vector of floating point type}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check is Matrix M dimension is a constant parameter +void test_invalid_matrix_M_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool 
matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_dimM = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check is Matrix K dimension is a constant parameter +void test_invalid_matrix_K_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_dimK = constants_buffer.Load(0); + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check is Matrix M dimension is non-zero +void test_invalid_matrix_M_dimension_non_zero() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_dimM = 0; + // expected-error@+3 {{matrix dimension must be greater than 0}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check is Matrix K dimension is non-zero +void test_invalid_matrix_K_dimension_non_zero() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_dimK = 0; + // expected-error@+4 
{{matrix dimension must be greater than 0}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if Matrix M dimension is less than Max +void test_invalid_matrix_M_dimension_less_than_Max() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = matrix_dimK * 4; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM_0 = 1025; + + // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_0, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimM_1 = 4097; + + // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_1, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if Matrix K dimension is less than Max in unpacked input vector case +void test_invalid_matrix_K_dimension_less_than_Max_unpacked_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK_0 = 1025; + + // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8; + const uint matrix_dimK_1 = 4096; + // expected-error@+4 {{matrix dimension K when using 
unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + +} + +// Check if Matrix M dimension is less than Max in packed input vector case +void test_invalid_matrix_M_dimension_less_than_Max_packed_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 1024; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 4096; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimK_0 = 4097; + + // expected-error@+4 {{matrix dimension K when using packed input vectors must be less than 4096, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +void test_invalid_input_interpretation_non_const() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation = constants_buffer.Load(0); + + // expected-error@+2 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if InputInterpretation is a valid value +void test_invalid_input_interpretation_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = 0; + + // expected-error@+2 {{0 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + 
is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_1 = 1; + + // expected-error@+2 {{1 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_2 = 6; + + // expected-error@+2 {{6 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_2, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_3 = 7; + + // expected-error@+2 {{7 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_3, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_4 = 10; + + // expected-error@+2 {{10 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_4, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_5 = 11; + + // expected-error@+2 {{11 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_5, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_6 = 12; + + // expected-error@+2 {{12 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_6, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_7 = 13; + + // expected-error@+2 {{13 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_7, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_8 = 14; + + // expected-error@+2 {{14 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_8, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + 
matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_9 = 15; + + // expected-error@+2 {{15 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_9, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_10 = 16; + + // expected-error@+2 {{16 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_10, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_11 = 23; + + // expected-error@+2 {{23 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_11, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_12 = 100; + + // expected-error@+2 {{100 is an invalid register interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_12, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} +// Check if Input and Output vector dimensions are valid -non packed +void test_invalid_input_output_vector_dimensions_non_packed_square_matrix() { + + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 32; + const uint matrix_dimK = 32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector output_vector_0; + vector input_vector_0 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector output_vector_1; + vector input_vector_1 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if Input and Output vector dimensions are valid 
-non packed +void test_invalid_input_output_vector_dimensions_non_packed_rectangle_matrix() { + + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 16; + const uint matrix_dimK = 32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + // Use dimension of Matrix K to trigger error + vector output_vector_0; + vector input_vector_0 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_0, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Check off by 1 errors + vector output_vector_1; + vector input_vector_1 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_1, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Check off by 1 errors + vector output_vector_2; + vector input_vector_2 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_2, is_output_unsigned, input_vector_2, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Use dimension of Matrix M to trigger error + vector output_vector_3; + vector input_vector_3 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_3, is_output_unsigned, input_vector_3, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Check off by 1 errors + vector output_vector_4; + vector input_vector_4 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_4, is_output_unsigned, input_vector_4, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Check off by 1 errors + vector output_vector_5; + vector input_vector_5 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length 
must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_5, is_output_unsigned, input_vector_5, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + // Swap dimensions to trigger error + vector output_vector_6; + vector input_vector_6 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMulAdd(output_vector_6, is_output_unsigned, input_vector_6, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if matrtrix interpretation is a constant value +void test_invalid_matrix_interpretation_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_interpretation_0 = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_0, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check for invalid matrix interpretation value +void test_invalid_matrix_interpretation_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_interpretation_0 = 0; + + // expected-error@+3 {{0 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_0, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_1 = 1; + + // expected-error@+3 {{1 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_1, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + 
matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_2 = 6; + + // expected-error@+3 {{6 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_2, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_3 = 7; + + // expected-error@+3 {{7 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_3, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_4 = 10; + + // expected-error@+3 {{10 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_4, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_5 = 11; + + // expected-error@+3 {{11 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_5, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_6 = 12; + + // expected-error@+3 {{12 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_6, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_7 = 13; + + // expected-error@+3 {{13 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_7, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_8 = 14; + + // expected-error@+3 {{14 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_8, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_9 = 15; + + // expected-error@+3 {{15 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_9, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_10 = 16; + + // expected-error@+3 {{16 is an invalid memory interpretation 
value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_10, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_11 = 23; + // expected-error@+3 {{23 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_11, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_interpretation_12 = 100; + + // expected-error@+3 {{100 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_12, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if matrix Layout is a constant value +void test_invalid_matrix_layout_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_layout = constants_buffer.Load(0); + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check invalid matrix layout value +void test_invalid_matrix_layout_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_layout_0 = 4; + + // expected-error@+4 {{matrix layout 4 is not valid, must be in the range [0, 3]}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_0, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if matrix is transposed is a constant value +void test_invalid_matrix_transposed_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + 
const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = constants_buffer.Load(0); + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if invalid matrix transpose value is used +void test_invalid_matrix_transpose_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed_0 = true; + + // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_0, matrix_is_transposed_0, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR; + const bool matrix_is_transposed_1 = true; + + // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_1, matrix_is_transposed_1, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + + +// Check invalid matrix stride value for optimal matrix layout +void test_invalid_matrix_stride_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + + const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const uint matrix_stride_0 = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, 
matrix_layout_0, matrix_is_transposed, + matrix_stride_0, bias_buffer, bias_offset, bias_interpretation); + + const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride_1 = 64; + + // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_1, matrix_is_transposed, + matrix_stride_1, bias_buffer, bias_offset, bias_interpretation); +} + +// Check bias interpretation is not a constant value +void test_invalid_bias_interpretation() { + vector output_vector; + const uint is_output_unsigned = 0; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const uint matrix_is_transposed = 0; + const uint matrix_stride = 0; + const uint bias_offset = 0; + + const uint bias_interpretation_0 = constants_buffer.Load(0); + + // expected-error@+6 {{expression is not an integer constant expression}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_0); +} + +// Check bias interpretation is not a valid value +void test_invalid_bias_interpretation_value() { + vector output_vector; + const uint is_output_unsigned = 0; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const uint matrix_is_transposed = 0; + const uint matrix_stride = 0; + const uint bias_offset = 0; + + const uint bias_interpretation_0 = 0; + + // expected-error@+6 {{0 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_0); + + const uint bias_interpretation_1 = 1; + + // expected-error@+6 {{1 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_1); + + const uint bias_interpretation_2 = 6; + + // expected-error@+6 {{6 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + 
matrix_stride, bias_buffer, bias_offset, + bias_interpretation_2); + + const uint bias_interpretation_3 = 7; + + // expected-error@+6 {{7 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_3); + + const uint bias_interpretation_4 = 10; + + // expected-error@+6 {{10 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_4); + + const uint bias_interpretation_5 = 11; + + // expected-error@+6 {{11 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_5); + + const uint bias_interpretation_6 = 12; + + // expected-error@+6 {{12 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_6); + + const uint bias_interpretation_7 = 13; + + // expected-error@+6 {{13 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_7); + + const uint bias_interpretation_8 = 14; + + // expected-error@+6 {{14 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_8); + + const uint bias_interpretation_9 = 15; + + // expected-error@+6 {{15 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_9); + + const uint bias_interpretation_10 = 16; + + // expected-error@+6 {{16 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_10); + + const uint bias_interpretation_11 = DataType::DATA_TYPE_SINT8_T4_PACKED; + + // expected-error@+6 
{{17 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_11); + + const uint bias_interpretation_12 = DataType::DATA_TYPE_UINT8_T4_PACKED; + + // expected-error@+6 {{18 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_12); + + const uint bias_interpretation_13 = 23; + + // expected-error@+6 {{23 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_13); + + const uint bias_interpretation_14 = 100; + + // expected-error@+6 {{100 is an invalid memory interpretation value}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, + bias_interpretation_14); + } diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl new file mode 100644 index 0000000000..4b0bd6dd87 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_add_valid.hlsl @@ -0,0 +1,244 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer output_vector_buffer; +ByteAddressBuffer constants_buffer; + +// Check valid input vector packed types +void test_valid_input_vector_packed_types() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + 
+ // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + +} + +// IsInputUnsigned must be true for packed input vector type +void test_valid_is_input_unsigned_packed_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 1; + + // expected-no-diagnostics@+2 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-no-diagnostics@+2 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check packed input vector dimension +void test_valid_packed_input_vector_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_UINT8; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const bool matrix_is_transposed = false; + const uint matrix_stride = 0; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_UINT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_1 = 7; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if Matrix M 
dimension is less than Max +void test_valid_matrix_M_dimension_less_than_Max() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = matrix_dimK * 4; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_0, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimM_1 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_1, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + +// Check if Matrix K dimension is less than Max in unpacked input vector case +void test_valid_matrix_K_dimension_less_than_Max_unpacked_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8; + const uint matrix_dimK_1 = 4; + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); + +} + +// Check if Matrix M dimension is less than Max in packed input vector case +void test_valid_matrix_M_dimension_less_than_Max_packed_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + 
const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimK_0 = 4096; + + // expected-no-diagnostics@+1 + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride, bias_buffer, bias_offset, bias_interpretation); +} + + + diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl new file mode 100644 index 0000000000..14f34d62c4 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_invalid.hlsl @@ -0,0 +1,1156 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +RWByteAddressBuffer output_vector_buffer; +ByteAddressBuffer constants_buffer; + +// Output vector, isUnsigned mismatch +void test_invalid_output_vector_type() { + + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector output_vector_0; + const uint is_output_unsigned_0 = 0; + + // expected-error@+1 {{IsOuputUnsigned must be true for vector of unsigned integer type}} + __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector output_vector_1; + const uint is_output_unsigned_1 = 1; + + // expected-error@+1 {{IsOuputUnsigned must be false for vector of signed integer type}} + __builtin_MatVecMul(output_vector_1, is_output_unsigned_1, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector output_vector_2; + const uint is_output_unsigned_2 = 1; + + // expected-error@+1 {{IsOuputUnsigned must be false for vector of floating point type}} + __builtin_MatVecMul(output_vector_2, is_output_unsigned_2, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// IsOutputUnsigned is not a constant parameter +void test_invalid_is_output_unsigned_non_const() { + + vector output_vector_0; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint 
matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint is_output_unsigned_0 = constants_buffer.Load(0); + + // expected-error@+1 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Input vector is incorrect type - 64 bit types +void test_invalid_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 0; + +// expected-error@+2 {{no matching function for call to '__builtin_MatVecMul'}} +// expected-note@+1 {{candidate function not viable: no known conversion from 'vector' to 'vector' for 3rd argument}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Input vector is incorrect type for packed InputInterpretation +void test_invalid_input_vector_type_packed_input_interpretation() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint input_interpretation_0 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 1; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in 
linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_2 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 1; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation_2, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_3 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_3 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_3 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_3, + is_input_unsigned_3, input_interpretation_3, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_4 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_4 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_4 = 0; + + // expected-error@+1 {{packed input vector type must be a 32-bit unsigned int type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_4, + is_input_unsigned_4, input_interpretation_4, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// IsInputUnsigned must be true for packed input vector type +void test_invalid_is_input_unsigned_packed_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_FLOAT32; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, 
input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for packed input interpretations in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check packed input vector dimension +void test_invalid_packed_input_vector_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_UINT8; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const bool matrix_is_transposed = false; + const uint matrix_stride = 0; + const uint bias_offset = 0; + const uint bias_interpretation = DataType::DATA_TYPE_UINT32; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_0 = 4; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_1 = 7; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_2 = 7; + + // expected-error@+1 {{packed input vector length must be the smallest number that can hold matrix dim K values of the packed(smaller) type in linalg mul/muladd operations}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_2, matrix_layout, matrix_is_transposed, + matrix_stride); + +} + +// Input vector type/isInputUnsigned mismatch +void test_invalid_input_vector_type_mismatch() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint 
is_input_unsigned_0 = 0; + + // expected-error@+2 {{IsInputUnsigned must be true for vector of unsigned integer type}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-error@+2 {{IsInputUnsigned must be false for vector of signed integer type}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 1; + + // expected-error@+2 {{IsInputUnsigned must be false for vector of floating point type}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix M dimension is a constant parameter +void test_invalid_matrix_M_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_dimM = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix K dimension is a constant parameter +void test_invalid_matrix_K_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_dimK = constants_buffer.Load(0); + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix M dimension is non-zero +void test_invalid_matrix_M_dimension_non_zero() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0;
+ const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_dimM = 0; + // expected-error@+3 {{matrix dimension must be greater than 0}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix K dimension is non-zero +void test_invalid_matrix_K_dimension_non_zero() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_dimK = 0; + // expected-error@+4 {{matrix dimension must be greater than 0}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix M dimension is less than Max +void test_invalid_matrix_M_dimension_less_than_Max() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = matrix_dimK * 4; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM_0 = 1025; + + // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_0, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimM_1 = 4097; + + // expected-error@+3 {{matrix dimension M must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_1, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix K dimension is less than Max in unpacked input vector case +void test_invalid_matrix_K_dimension_less_than_Max_unpacked_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = 
MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK_0 = 1025; + + // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8; + const uint matrix_dimK_1 = 4096; + // expected-error@+4 {{matrix dimension K when using unpacked input vectors must be less than 1024, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride); + +} + +// Check if Matrix M dimension is less than Max in packed input vector case +void test_invalid_matrix_M_dimension_less_than_Max_packed_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 1024; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 4096; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimK_0 = 4097; + + // expected-error@+4 {{matrix dimension K when using packed input vectors must be less than 4096, in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +//Check if InputInterpretation is a constant parameter +void test_invalid_input_interpretation_non_const() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint input_interpretation = constants_buffer.Load(0); + + // expected-error@+2 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if InputInterpretation is a valid value +void test_invalid_input_interpretation_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint 
is_input_unsigned = 0; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint input_interpretation_0 = 0; + + // expected-error@+2 {{0 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_1 = 1; + + // expected-error@+2 {{1 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_2 = 6; + + // expected-error@+2 {{6 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_2, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_3 = 7; + + // expected-error@+2 {{7 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_3, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_4 = 10; + + // expected-error@+2 {{10 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_4, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_5 = 11; + + // expected-error@+2 {{11 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_5, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_6 = 12; + + // expected-error@+2 {{12 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_6, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_7 = 13; + + // expected-error@+2 {{13 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_7, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_8 = 14; + + // expected-error@+2 {{14 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_8, 
matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_9 = 15; + + // expected-error@+2 {{15 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_9, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_10 = 16; + + // expected-error@+2 {{16 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_10, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_11 = 23; + + // expected-error@+2 {{23 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_11, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_12 = 100; + + // expected-error@+2 {{100 is an invalid register interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation_12, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} +// Check if Input and Output vector dimensions are valid -non packed +void test_invalid_input_output_vector_dimensions_non_packed_square_matrix() { + + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 32; + const uint matrix_dimK = 32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector output_vector_0; + vector input_vector_0 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_0, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector output_vector_1; + vector input_vector_1 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_1, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Input and Output vector dimensions are valid -non packed +void test_invalid_input_output_vector_dimensions_non_packed_rectangle_matrix() { + + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = 
DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 16; + const uint matrix_dimK = 32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + // Use dimension of Matrix K to trigger error + vector output_vector_0; + vector input_vector_0 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_0, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Check off by 1 errors + vector output_vector_1; + vector input_vector_1 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_1, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Check off by 1 errors + vector output_vector_2; + vector input_vector_2 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_2, is_output_unsigned, input_vector_2, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Use dimension of Matrix M to trigger error + vector output_vector_3; + vector input_vector_3 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_3, is_output_unsigned, input_vector_3, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Check off by 1 errors + vector output_vector_4; + vector input_vector_4 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_4, is_output_unsigned, input_vector_4, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Check off by 1 errors + vector output_vector_5; + vector input_vector_5 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{unpacked input vector length must be equal to Matrix K dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_5, is_output_unsigned, input_vector_5, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + // Swap dimensions to trigger error + vector output_vector_6; + vector input_vector_6 = + input_vector_buffer.Load >(0); + + // expected-error@+1 {{output vector length must be equal to Matrix M dimension in a linalg Mul/MulAdd operation}} + __builtin_MatVecMul(output_vector_6, is_output_unsigned, input_vector_6, 
+ is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if matrix interpretation is a constant value +void test_invalid_matrix_interpretation_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_interpretation_0 = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_0, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check for invalid matrix interpretation value +void test_invalid_matrix_interpretation_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_interpretation_0 = 0; + + // expected-error@+3 {{0 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_0, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_1 = 1; + + // expected-error@+3 {{1 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_1, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_2 = 6; + + // expected-error@+3 {{6 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_2, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_3 = 7; + + // expected-error@+3 {{7 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_3, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_4 = 10; + + // expected-error@+3 {{10 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, 
matrix_interpretation_4, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_5 = 11; + + // expected-error@+3 {{11 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_5, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_6 = 12; + + // expected-error@+3 {{12 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_6, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_7 = 13; + + // expected-error@+3 {{13 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_7, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_8 = 14; + + // expected-error@+3 {{14 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_8, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_9 = 15; + + // expected-error@+3 {{15 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_9, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_10 = 16; + + // expected-error@+3 {{16 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_10, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_11 = 23; + // expected-error@+3 {{23 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_11, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint matrix_interpretation_12 = 100; + + // expected-error@+3 {{100 is an invalid memory interpretation value}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation_12, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if matrix Layout is a constant value +void test_invalid_matrix_layout_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + 
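+  // Note: as with the matrix interpretation argument tested above, the +  // matrix layout argument of __builtin_MatVecMul must fold to an integer +  // constant at compile time. The call below deliberately loads the layout +  // from a buffer instead of using a literal such as +  // MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR, so it is expected to be rejected +  // as a non-constant expression.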
const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_layout = constants_buffer.Load(0); + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check invalid matrix layout value +void test_invalid_matrix_layout_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint matrix_layout_0 = 4; + + // expected-error@+4 {{matrix layout 4 is not valid, must be in the range [0, 3]}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_0, matrix_is_transposed, + matrix_stride); +} + +// Check if matrix is transposed is a constant value +void test_invalid_matrix_transposed_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = constants_buffer.Load(0); + const uint matrix_stride = 64; + + // expected-error@+4 {{expression is not an integer constant expression}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if invalid matrix transpose value is used +void test_invalid_matrix_transpose_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_stride = 64; + + const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed_0 = true; + + // expected-error@+4 {{RowMajor and ColumnMajor matrices are not transposable}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_0, matrix_is_transposed_0, + matrix_stride); + + const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR; + const bool matrix_is_transposed_1 = true; + + // expected-error@+4 {{RowMajor and ColumnMajor matrices are not 
transposable}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_1, matrix_is_transposed_1, + matrix_stride); +} + + +// Check invalid matrix stride value for optimal matrix layout +void test_invalid_matrix_stride_constant_value() { + + vector output_vector; + const uint is_output_unsigned = 1; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const bool matrix_is_transposed = false; + + const uint matrix_layout_0 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const uint matrix_stride_0 = 64; + + // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_0, matrix_is_transposed, + matrix_stride_0); + + const uint matrix_layout_1 = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride_1 = 64; + + // expected-error@+5 {{for optimal matrix layout, matrix stride must be 0}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout_1, matrix_is_transposed, + matrix_stride_1); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl new file mode 100644 index 0000000000..5972b22b95 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/mul_valid.hlsl @@ -0,0 +1,344 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +RWByteAddressBuffer output_vector_buffer; +ByteAddressBuffer const_buffer; + +// Output vector, isUnsigned mismatch +void test_valid_output_vector_type() { + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector output_vector_0; + const uint is_output_unsigned_0 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + + vector output_vector_1; + const uint is_output_unsigned_1 = 0; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector_1, is_output_unsigned_1, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + + vector output_vector_2; 
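+  // Third accepted variant: together with the two calls above, this covers +  // both 0 and 1 for the is_output_unsigned flag. All three calls are +  // expected to compile without diagnostics as long as the dimension and +  // constant-expression requirements exercised by the companion invalid-case +  // tests are satisfied.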
+ const uint is_output_unsigned_2 = 0; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector_2, is_output_unsigned_2, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); +} + +void test_valid_is_output_unsigned_non_const() { + + vector output_vector_0; + vector input_vector = + input_vector_buffer.Load >(0); + const uint is_input_unsigned = 0; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint is_output_unsigned_0 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector_0, is_output_unsigned_0, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Input vector is incorrect type +void test_valid_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 0; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_2 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_2 = 0; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_2, + is_input_unsigned_2, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check valid input vector packed types +void test_valid_input_vector_packed_types() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint 
is_input_unsigned_0 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + +} + +// IsInputUnsigned must be true for packed input vector type +void test_valid_is_input_unsigned_packed_input_vector_type() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_0 = 1; + + // expected-no-diagnostics@+2 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned_0, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + const uint input_interpretation_1 = DataType::DATA_TYPE_SINT8_T4_PACKED; + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint is_input_unsigned_1 = 1; + + // expected-no-diagnostics@+2 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned_1, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check packed input vector dimension +void test_valid_packed_input_vector_dimension() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint input_interpretation = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_UINT8; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + const bool matrix_is_transposed = false; + const uint matrix_stride = 0; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint matrix_dimK_1 = 7; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, 
matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix M dimension is less than Max +void test_valid_matrix_M_dimension_less_than_Max() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = matrix_dimK * 4; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_0, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint matrix_dimM_1 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM_1, + matrix_dimK, matrix_layout, matrix_is_transposed, + matrix_stride); +} + +// Check if Matrix K dimension is less than Max in unpacked input vector case +void test_valid_matrix_K_dimension_less_than_Max_unpacked_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimK_0 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); + + vector input_vector_1 = + input_vector_buffer.Load >(0); + const uint input_interpretation_1 = DataType::DATA_TYPE_UINT8; + const uint matrix_dimK_1 = 4; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_1, + is_input_unsigned, input_interpretation_1, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_1, matrix_layout, matrix_is_transposed, + matrix_stride); + +} + +// Check if Matrix M dimension is less than Max in packed input vector case +void test_valid_matrix_M_dimension_less_than_Max_packed_input_vector() { + + vector output_vector; + const uint is_output_unsigned = 1; + const uint is_input_unsigned = 1; + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_dimM = 4; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + vector input_vector_0 = + input_vector_buffer.Load >(0); + const uint input_interpretation_0 = DataType::DATA_TYPE_UINT8_T4_PACKED; + const uint 
matrix_dimK_0 = 4096; + + // expected-no-diagnostics@+1 + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector_0, + is_input_unsigned, input_interpretation_0, matrix_buffer, + matrix_offset, matrix_interpretation, matrix_dimM, + matrix_dimK_0, matrix_layout, matrix_is_transposed, + matrix_stride); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl new file mode 100644 index 0000000000..4e15c92a5d --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_invalid.hlsl @@ -0,0 +1,256 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +RWByteAddressBuffer accumulate_buffer; +ByteAddressBuffer constants_buffer; + +// Check if input vectors aren't the same component type +void test_invalid_input_vector_component_type() { + + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride = 0; + + vector input_vector_0_0 = input_vector_buffer.Load >(0); + vector input_vector_1_0 = input_vector_buffer.Load >(0); + + // expected-error@+1 {{input vectors of outerproductaccumulate must have the same element type}} + __builtin_OuterProductAccumulate(input_vector_0_0, input_vector_1_0, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); + + vector input_vector_0_1 = input_vector_buffer.Load >(0); + vector input_vector_1_1 = input_vector_buffer.Load >(0); + + // expected-error@+1 {{input vectors of outerproductaccumulate must have the same element type}} + __builtin_OuterProductAccumulate(input_vector_0_1, input_vector_1_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} + +// Check for non constant matrix interpretation +void test_non_constant_matrix_interpretation() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride = 0; + + const uint matrix_interpretation = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} + +// Check for matrix interpretation is not a valid value +void test_invalid_matrix_interpretation() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride = 0; + + const uint matrix_interpretation = 0; + + // expected-error@+3 {{0 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_2 = 1; + + // expected-error@+3 {{1 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + 
matrix_interpretation_2, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_3 = 6; + + // expected-error@+3 {{6 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_3, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_4 = 7; + + // expected-error@+3 {{7 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_4, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_5 = 10; + + // expected-error@+3 {{10 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_5, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_6 = 11; + + // expected-error@+3 {{11 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_6, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_7 = 12; + + // expected-error@+3 {{12 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_7, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_8 = 13; + + // expected-error@+3 {{13 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_8, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_9 = 14; + + // expected-error@+3 {{14 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_9, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_10 = 15; + + // expected-error@+3 {{15 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_10, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_11 = 16; + + // expected-error@+3 {{16 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_11, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_12 = DataType::DATA_TYPE_SINT8_T4_PACKED; + + // expected-error@+3 {{17 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_12, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_13 = DataType::DATA_TYPE_UINT8_T4_PACKED; + + // expected-error@+3 {{18 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_13, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_14 = 23; + + // expected-error@+3 {{23 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_14, matrix_layout, + matrix_stride); + + const uint matrix_interpretation_15 
= 100; + + // expected-error@+3 {{100 is an invalid memory interpretation value}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation_15, matrix_layout, + matrix_stride); + +} + +// Check for matrix layout is not a constant parameter +void test_non_constant_matrix_layout() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_stride = 0; + + const uint matrix_layout = constants_buffer.Load(0); + + // expected-error@+3 {{expression is not an integer constant expression}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} + +// Check for matrix layout is not a valid value +void test_invalid_matrix_layout() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_stride = 0; + + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_ROW_MAJOR; + + // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); + + const uint matrix_layout_2 = MatrixLayout::MATRIX_LAYOUT_COLUMN_MAJOR; + + // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout_2, + matrix_stride); + + const uint matrix_layout_3 = MatrixLayout::MATRIX_LAYOUT_MUL_OPTIMAL; + + // expected-error@+3 {{matrix layout for outerproductaccumulate must be 3}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout_3, + matrix_stride); + +} + +// Check for matrix stride is zero, if constant +void test_zero_matrix_stride() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + + const uint matrix_stride = 16; + + // expected-error@+4 {{for optimal matrix layout, matrix stride must be 0}} + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl new file mode 100644 index 0000000000..85298e2dbb --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/outer_product_accumulate_valid.hlsl @@ -0,0 +1,66 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +using namespace dx::linalg; + +ByteAddressBuffer input_vector_buffer; +RWByteAddressBuffer accumulate_buffer; +ByteAddressBuffer constants_buffer; + +// Check for input vectors aren't the same component type +void test_invalid_input_vector_component_type() { + + const uint 
matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + const uint matrix_stride = 0; + + vector input_vector_0_0 = input_vector_buffer.Load >(0); + vector input_vector_1_0 = input_vector_buffer.Load >(0); + + // expected-no-diagnostics@+1 + __builtin_OuterProductAccumulate(input_vector_0_0, input_vector_1_0, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); + + vector input_vector_0_1 = input_vector_buffer.Load >(0); + vector input_vector_1_1 = input_vector_buffer.Load >(0); + + // expected-no-diagnostics@+1 + __builtin_OuterProductAccumulate(input_vector_0_1, input_vector_1_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); + + vector input_vector_0_2 = input_vector_buffer.Load >(0); + vector input_vector_1_2 = input_vector_buffer.Load >(0); + + // expected-no-diagnostics@+1 + __builtin_OuterProductAccumulate(input_vector_0_2, input_vector_1_2, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} + +// Check for non constant matrix stride +void test_non_constant_matrix_stride() { + + vector input_vector_0 = input_vector_buffer.Load >(0); + vector input_vector_1 = input_vector_buffer.Load >(0); + const uint matrix_offset = 0; + const uint matrix_interpretation = DataType::DATA_TYPE_FLOAT32; + const uint matrix_layout = MatrixLayout::MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL; + + const uint matrix_stride = constants_buffer.Load(0); + + // expected-no-diagnostics@+4 + __builtin_OuterProductAccumulate(input_vector_0, input_vector_1, + accumulate_buffer, matrix_offset, + matrix_interpretation, matrix_layout, + matrix_stride); +} + +// Check for matrix stride is not a valid value + diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl new file mode 100644 index 0000000000..be67d92546 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/make-interp-vec-errors.hlsl @@ -0,0 +1,33 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify + +#include +ByteAddressBuffer Buf; + +export float4 Test1(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}} + // expected-note@dx/linalg.h:113{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}} + return Mul( + Matrix, MakeInterpretedVector<2>(Input)); +} + +enum DataType { + DATA_TYPE_InvalidType = 40 +}; + +export float4 Test2(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // expected-error@+3{{no matching function for call to 'MakeInterpretedVector'}} + // expected-note@dx/linalg.h:113{{candidate template ignored: invalid explicitly-specified argument for template parameter 'DT'}} + return Mul( + Matrix, MakeInterpretedVector(Input)); +} + diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl new file mode 100644 index 0000000000..b911de648e --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-errors.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify + +#include + +ByteAddressBuffer Buf; + +vector MixUpVectorAndMatrixArguments(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 
0, 0}; + + // expected-error@+2{{no matching function for call to 'Mul'}} + // expected-note@dx/linalg.h:127{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}} + return Mul(MakeInterpretedVector(Input), Matrix); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl new file mode 100644 index 0000000000..2018acafab --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-mul-transpose-errors.hlsl @@ -0,0 +1,30 @@ +// XFAIL: * +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +ByteAddressBuffer Buf; + +export float4 Test1(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // PREVIEW CHECK TODO: + // expected-error@+1{{something about transposing not supported for rowmajor / colmajor layouts}} + return Mul( + Matrix, MakeInterpretedVector(Input)); +} + +export vector Test2(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // PREVIEW CHECK TODO: + // expected-error@+1{{something about transposing not supported for rowmajor / colmajor layouts}} + return Mul(Matrix, + MakeInterpretedVector(Input)); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl new file mode 100644 index 0000000000..24ad3ef46c --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/mat-vec-muladd-errors.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s -verify + +#include + +ByteAddressBuffer Buf; + +vector MixUpVectorAndMatrixArguments(vector Input) { + using namespace dx::linalg; + + MatrixRef Matrix = { + Buf, 0, 0}; + + // expected-error@+2{{no matching function for call to 'MulAdd'}} + // expected-note@dx/linalg.h:153{{candidate template ignored: could not match 'MatrixRefImpl' against 'InterpretedVector'}} + return MulAdd(MakeInterpretedVector(Input), Matrix, MakeInterpretedVector(Input)); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl new file mode 100644 index 0000000000..5759631bcb --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/outerproductaccumulate-errors.hlsl @@ -0,0 +1,44 @@ +// RUN: %dxc -I %hlsl_headers -T lib_6_9 -enable-16bit-types %s -verify + +#include + +RWByteAddressBuffer RWBuf; + +// test for inputs of different size +export void Test4(vector Input1, vector Input2) { + using namespace dx::linalg; + + RWMatrixRef + matrix = {RWBuf, 0, 0}; + + // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}} + // expected-note@dx/linalg.h:177{{candidate template ignored: could not match 0 against 1}} + + OuterProductAccumulate(Input1, Input2, matrix); +} + +// now test for an error when element types differ +export void Test5(vector Input1, vector Input2) { + using namespace dx::linalg; + + RWMatrixRef + matrix = {RWBuf, 0, 0}; + + // expected-error@+3{{no matching function for call to 'OuterProductAccumulate'}} + // expected-note@dx/linalg.h:177{{candidate template ignored: could not match 0 against 1}} + + OuterProductAccumulate(Input1, Input2, matrix); +} + +// now test for an error when matrix transpose parameter is true +export void Test4(vector Input1, vector Input2) { + using namespace dx::linalg; + + RWMatrixRef + matrix = {RWBuf, 0, 0}; + + // expected-error@+3{{no matching 
function for call to 'OuterProductAccumulate'}} + // expected-note@dx/linalg.h:177{{candidate template ignored: deduced conflicting types for parameter 'ElTy' ('int' vs. 'unsigned int')}} + + OuterProductAccumulate(Input1, Input2, matrix); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl new file mode 100644 index 0000000000..57683b9a59 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl @@ -0,0 +1,59 @@ +// RUN: %dxc -T lib_6_8 %s -verify + +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; + +[Shader("compute")] +[Numthreads(1,1,1)] +void cs_main() +{ + vector output_vector; + static const uint is_output_unsigned = 0; + + vector input_vector; + const uint is_input_unsigned = 0; + const uint input_interpretation = 9; /*F32*/ + + const uint matrix_offset = 0; + const uint matrix_interpretation = 9; /*F32*/ + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = 0; /*RowMajor*/ + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + //expected-error@+1{{intrinsic hlsl::__builtin_MatVecMul potentially used by ''cs_main'' requires shader model 6.9 or greater}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + + const uint bias_offset = 0; + const uint bias_interpretation = 9; /*F32*/ + + //expected-error@+1{{intrinsic hlsl::__builtin_MatVecMulAdd potentially used by ''cs_main'' requires shader model 6.9 or greater}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, + bias_interpretation); + + vector input_vector1; + vector input_vector2; + const uint opa_matrix_offset = 0; + const uint opa_matrix_interpretation = 5; /*U32*/ + const uint opa_matrix_layout = 3; /*OuterProductOptimal*/ + const uint opa_matrix_stride = 0; + + //expected-error@+1{{intrinsic hlsl::__builtin_OuterProductAccumulate potentially used by ''cs_main'' requires shader model 6.9 or greater}} + __builtin_OuterProductAccumulate(input_vector1, input_vector2, + rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation, + opa_matrix_layout, opa_matrix_stride); + + const uint va_matrix_offset = 0; + + //expected-error@+1{{intrinsic hlsl::__builtin_VectorAccumulate potentially used by ''cs_main'' requires shader model 6.9 or greater}} + __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer, + va_matrix_offset); +} \ No newline at end of file diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl new file mode 100644 index 0000000000..4c8ae6f049 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/vectoraccumulate-errors.hlsl @@ -0,0 +1,16 @@ +// XFAIL: * +// RUN: %dxc -I %hlsl_headers -T lib_6_9 %s | FileCheck %s + +#include + +RWByteAddressBuffer RWBuf; + +export void Test5(vector Input) { + using namespace dx::linalg; + + RWBuf.Store >(0, Input); + + // PREVIEW CHECK TODO: + // CHECK: Something about an error due to illegal conversions + VectorAccumulate(Input, 
RWBuf, 0); +} diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl index baa3a07a5b..b091bd2ac5 100644 --- a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl +++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject-in-buffer.hlsl @@ -1,4 +1,4 @@ // RUN: %dxc -T lib_6_9 %s -verify -// expected-error@+1{{'dx::HitObject' is an object and cannot be used as a type parameter}} +// expected-error@+1{{object 'dx::HitObject' is not allowed in structured buffers}} RWStructuredBuffer InvalidBuffer; diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl new file mode 100644 index 0000000000..05aa790ad4 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_accessors.hlsl @@ -0,0 +1,263 @@ +// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST +// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL + +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetHitKind +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetHitKind 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetHitKind 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 366 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetInstanceID +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetInstanceID 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetInstanceID 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 367 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetInstanceIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetInstanceIndex 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetInstanceIndex 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 368 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetObjectRayDirection +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetObjectRayDirection 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetObjectRayDirection 'vector ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'vector':'vector' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 369 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetObjectRayOrigin +// AST-NEXT: | | | 
|-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetObjectRayOrigin 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetObjectRayOrigin 'vector ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'vector':'vector' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 370 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetObjectToWorld3x4 +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetObjectToWorld3x4 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetObjectToWorld3x4 'matrix ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'matrix':'matrix' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 371 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetObjectToWorld4x3 +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetObjectToWorld4x3 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetObjectToWorld4x3 'matrix ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'matrix':'matrix' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 372 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetPrimitiveIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetPrimitiveIndex 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetPrimitiveIndex 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 373 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetRayFlags +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetRayFlags 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetRayFlags 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 374 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetRayTCurrent +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetRayTCurrent 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetRayTCurrent 'float ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'float' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 375 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetRayTMin +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | 
|-CXXMethodDecl {{[^ ]+}} <> implicit GetRayTMin 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetRayTMin 'float ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'float' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 376 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetShaderTableIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetShaderTableIndex 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetShaderTableIndex 'unsigned int ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 377 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetWorldRayDirection +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetWorldRayDirection 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetWorldRayDirection 'vector ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'vector':'vector' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 378 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetWorldRayOrigin +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetWorldRayOrigin 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetWorldRayOrigin 'vector ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'vector':'vector' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 379 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetWorldToObject3x4 +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetWorldToObject3x4 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetWorldToObject3x4 'matrix ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'matrix':'matrix' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 380 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetWorldToObject4x3 +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetWorldToObject4x3 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetWorldToObject4x3 'matrix ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'matrix':'matrix' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 381 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> IsHit +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit IsHit 
'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used IsHit 'bool ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'bool' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 383 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> IsMiss +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit IsMiss 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used IsMiss 'bool ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'bool' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 384 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> IsNop +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit IsNop 'TResult () const' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used IsNop 'bool ()' extern +// AST-NEXT: | | | |-TemplateArgument type 'bool' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 385 +// AST-NEXT: | | | |-ConstAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> LoadLocalRootTableConstant +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TRootConstantOffsetInBytes +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit LoadLocalRootTableConstant 'TResult (TRootConstantOffsetInBytes) const' +// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> RootConstantOffsetInBytes 'TRootConstantOffsetInBytes' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used LoadLocalRootTableConstant 'unsigned int (unsigned int)' extern +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> LoadLocalRootTableConstant 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 386 +// AST-NEXT: | | | |-PureAttr {{[^ ]+}} <> Implicit +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" +// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> SetShaderTableIndex +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult +// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TRecordIndex +// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit SetShaderTableIndex 'TResult (TRecordIndex) const' +// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> RecordIndex 'TRecordIndex' +// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used SetShaderTableIndex 'void (unsigned int)' extern +// AST-NEXT: | | | |-TemplateArgument type 'void' +// AST-NEXT: | | | |-TemplateArgument type 'unsigned int' +// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> SetShaderTableIndex 'unsigned int' +// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 388 +// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 "" + +// FCGL: define void @"\01?main@@YAXXZ"() #0 { +// FCGL: %{{[^ ]+}} = call %dx.types.HitObject* @"dx.hl.op..%dx.types.HitObject* (i32, %dx.types.HitObject*)"(i32 358, %dx.types.HitObject* %[[HIT:[^ ]+]]) +// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, i32)"(i32 388, 
%dx.types.HitObject* %[[HIT]], i32 1) +// FCGL: %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 383, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 384, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i1 @"dx.hl.op.rn.i1 (i32, %dx.types.HitObject*)"(i32 385, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 365, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 366, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 368, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 367, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 373, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 377, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.ro.i32 (i32, %dx.types.HitObject*, i32)"(i32 386, %dx.types.HitObject* %[[HIT]], i32 40) +// FCGL: %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 379, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 378, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 370, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call <3 x float> @"dx.hl.op.rn.<3 x float> (i32, %dx.types.HitObject*)"(i32 369, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call %class.matrix.float.3.4 @"dx.hl.op.rn.%class.matrix.float.3.4 (i32, %dx.types.HitObject*)"(i32 371, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call %class.matrix.float.4.3 @"dx.hl.op.rn.%class.matrix.float.4.3 (i32, %dx.types.HitObject*)"(i32 372, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call %class.matrix.float.3.4 @"dx.hl.op.rn.%class.matrix.float.3.4 (i32, %dx.types.HitObject*)"(i32 380, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call %class.matrix.float.4.3 @"dx.hl.op.rn.%class.matrix.float.4.3 (i32, %dx.types.HitObject*)"(i32 381, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call i32 @"dx.hl.op.rn.i32 (i32, %dx.types.HitObject*)"(i32 374, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 376, %dx.types.HitObject* %[[HIT]]) +// FCGL: %{{[^ ]+}} = call float @"dx.hl.op.rn.float (i32, %dx.types.HitObject*)"(i32 375, %dx.types.HitObject* %[[HIT]]) +// FCGL: ret void + +RWByteAddressBuffer outbuf; + +template +float hashM(in matrix mat) { + float h = 0.f; + for (int i = 0; i < M; ++i) + for (int j = 0; j < N; ++j) + h += mat[i][j]; + return h; +} + +[shader("raygeneration")] +void main() { + dx::HitObject hit; + int isum = 0; + float fsum = 0.0f; + vector vsum = 0; + + ///// Setters + hit.SetShaderTableIndex(1); + + ///// Getters + + // i1 accessors + isum += hit.IsHit(); + isum += hit.IsMiss(); + isum += hit.IsNop(); + + // i32 accessors + isum += hit.GetGeometryIndex(); + isum += hit.GetHitKind(); + isum += hit.GetInstanceIndex(); + isum += hit.GetInstanceID(); + isum += hit.GetPrimitiveIndex(); + isum += hit.GetShaderTableIndex(); + isum += hit.LoadLocalRootTableConstant(40); + + // float3 accessors + vsum += 
hit.GetWorldRayOrigin();
+  vsum += hit.GetWorldRayDirection();
+  vsum += hit.GetObjectRayOrigin();
+  vsum += hit.GetObjectRayDirection();
+  fsum += vsum[0] + vsum[1] + vsum[2];
+
+  // matrix accessors
+  fsum += hashM<3, 4>(hit.GetObjectToWorld3x4());
+  fsum += hashM<4, 3>(hit.GetObjectToWorld4x3());
+  fsum += hashM<3, 4>(hit.GetWorldToObject3x4());
+  fsum += hashM<4, 3>(hit.GetWorldToObject4x3());
+
+  // f32 accessors
+  isum += hit.GetRayFlags();
+  fsum += hit.GetRayTMin();
+  fsum += hit.GetRayTCurrent();
+
+  outbuf.Store(0, fsum);
+  outbuf.Store(4, isum);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
new file mode 100644
index 0000000000..609d94f291
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes.hlsl
@@ -0,0 +1,34 @@
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> GetAttributes
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TAttributes
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit GetAttributes 'TResult (TAttributes &) const'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> Attributes 'TAttributes &'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used GetAttributes 'void (CustomAttrs &)' extern
+// AST-NEXT: | | | |-TemplateArgument type 'void'
+// AST-NEXT: | | | |-TemplateArgument type 'CustomAttrs'
+// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> GetAttributes 'CustomAttrs &&__restrict'
+// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 364
+// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 ""
+
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %struct.CustomAttrs*)"(i32 364, %dx.types.HitObject* %{{[^ ]+}}, %struct.CustomAttrs* %{{[^ ]+}})
+
+RWByteAddressBuffer outbuf;
+
+struct
+CustomAttrs {
+  float4 v;
+  int y;
+};
+
+[shader("raygeneration")]
+void main() {
+  dx::HitObject hit;
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
+  float sum = attrs.v.x + attrs.v.y + attrs.v.z + attrs.v.w + attrs.y;
+  outbuf.Store(0, sum);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
new file mode 100644
index 0000000000..97bb81a7cb
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_longvec.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -T lib_6_9 -E main %s -verify
+
+struct
+CustomAttrs {
+  vector v;
+  int y;
+};
+
+[shader("raygeneration")]
+void main() {
+  dx::HitObject hit;
+  // expected-error@+2{{vectors of over 4 elements in attributes are not supported}}
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
new file mode 100644
index 0000000000..f8935676c5
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_attributes_invalid_udt.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -T lib_6_9 -E main %s -verify
+
+struct
+CustomAttrs {
+  vector v;
+  RWStructuredBuffer buf;
+};
+
+[shader("raygeneration")]
+void main() {
+  dx::HitObject hit;
+  CustomAttrs attrs;
+  hit.GetAttributes(attrs);
+  // expected-error@-1{{vectors of over 4 elements in attributes are not supported}}
+  // expected-error@-2{{attributes type must be a user-defined type composed of only numeric types}}
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
new file mode 100644
index 0000000000..e4a13d8a62
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_fromrayquery.hlsl
@@ -0,0 +1,72 @@
+// RUN: %dxc -T lib_6_9 -E main %s -fcgl | FileCheck %s --check-prefix FCGL
+// RUN: %dxc -T lib_6_9 -E main %s -ast-dump-implicit | FileCheck %s --check-prefix AST
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> FromRayQuery
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class Trq
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit FromRayQuery 'TResult (Trq) const' static
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> rq 'Trq'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used FromRayQuery 'dx::HitObject (RayQuery)' static
+// AST-NEXT: | | | |-TemplateArgument type 'dx::HitObject'
+// AST-NEXT: | | | |-TemplateArgument type 'RayQuery':'RayQuery<5, 0>'
+// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> rq 'RayQuery':'RayQuery<5, 0>'
+// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 363
+// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 ""
+
+// AST: | | |-FunctionTemplateDecl {{[^ ]+}} <> FromRayQuery
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TResult
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class Trq
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class THitKind
+// AST-NEXT: | | | |-TemplateTypeParmDecl {{[^ ]+}} <> class TAttributes
+// AST-NEXT: | | | |-CXXMethodDecl {{[^ ]+}} <> implicit FromRayQuery 'TResult (Trq, THitKind, TAttributes) const' static
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> rq 'Trq'
+// AST-NEXT: | | | | |-ParmVarDecl {{[^ ]+}} <> HitKind 'THitKind'
+// AST-NEXT: | | | | `-ParmVarDecl {{[^ ]+}} <> Attributes 'TAttributes'
+// AST-NEXT: | | | `-CXXMethodDecl {{[^ ]+}} <> used FromRayQuery 'dx::HitObject (RayQuery, unsigned int, CustomAttrs)' static
+// AST-NEXT: | | | |-TemplateArgument type 'dx::HitObject'
+// AST-NEXT: | | | |-TemplateArgument type 'RayQuery':'RayQuery<5, 0>'
+// AST-NEXT: | | | |-TemplateArgument type 'unsigned int'
+// AST-NEXT: | | | |-TemplateArgument type 'CustomAttrs'
+// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> rq 'RayQuery':'RayQuery<5, 0>'
+// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> HitKind 'unsigned int'
+// AST-NEXT: | | | |-ParmVarDecl {{[^ ]+}} <> Attributes 'CustomAttrs'
+// AST-NEXT: | | | |-HLSLIntrinsicAttr {{[^ ]+}} <> Implicit "op" "" 363
+// AST-NEXT: | | | `-AvailabilityAttr {{[^ ]+}} <> Implicit 6.9 0 0 ""
+
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*)"(i32 363, %dx.types.HitObject* %[[HITPTR0:[^ ]+]], %"class.RayQuery<5, 0>"* %[[RQ:[^ ]+]])
+// FCGL-NEXT: call void @"\01?Use@@YAXVHitObject@dx@@@Z"(%dx.types.HitObject* %[[HITPTR0]])
+// FCGL: call void @"dx.hl.op..void (i32, %dx.types.HitObject*, %\22class.RayQuery<5, 0>\22*, i32, %struct.CustomAttrs*)"(i32 363, %dx.types.HitObject* %[[HITPTR1:[^ ]+]], %"class.RayQuery<5, 0>"* %[[RQ]], i32 16, %struct.CustomAttrs* %{{[^ ]+}})
+// FCGL-NEXT: call void @"\01?Use@@YAXVHitObject@dx@@@Z"(%dx.types.HitObject* %[[HITPTR1]])
+
+RaytracingAccelerationStructure RTAS;
+RWStructuredBuffer UAV : register(u0);
+
+RayDesc MakeRayDesc() {
+  RayDesc desc;
+  desc.Origin = float3(0, 0, 0);
+  desc.Direction = float3(1, 0, 0);
+  desc.TMin = 0.0f;
+  desc.TMax = 9999.0;
+  return desc;
+}
+
+struct CustomAttrs {
+  float x;
+  float y;
+};
+
+void Use(in dx::HitObject hit) {
+  dx::MaybeReorderThread(hit);
+}
+
+[shader("raygeneration")]
+void main() {
+  RayQuery q;
+  RayDesc ray = MakeRayDesc();
+  q.TraceRayInline(RTAS, RAY_FLAG_NONE, 0xFF, ray);
+
+  Use(dx::HitObject::FromRayQuery(q));
+
+  CustomAttrs attrs = {1.f, 2.f};
+  Use(dx::HitObject::FromRayQuery(q, 16, attrs));
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl
new file mode 100644
index 0000000000..f4781bc796
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -T lib_6_9 %s -D TEST_NUM=0 %s -verify
+// RUN: %dxc -T lib_6_9 %s -D TEST_NUM=1 %s -verify
+
+RaytracingAccelerationStructure scene : register(t0);
+
+struct Payload
+{
+  int a : read (caller, closesthit, miss) : write(caller, closesthit, miss);
+};
+
+struct Attribs
+{
+  float2 barys;
+};
+
+[shader("raygeneration")]
+void RayGen()
+{
+// expected-error@+1{{type 'Payload' used as payload requires that it is annotated with the [raypayload] attribute}}
+  Payload payload_in_rg;
+  RayDesc ray;
+#if TEST_NUM == 0
+  dx::HitObject::TraceRay( scene, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload_in_rg );
+#else
+  dx::HitObject::Invoke( dx::HitObject(), payload_in_rg );
+#endif
+}
\ No newline at end of file
diff --git a/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
new file mode 100644
index 0000000000..ee4ff8c020
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/objects/HitObject/hitobject_traceinvoke_payload_udt.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc -T lib_6_9 %s -verify
+
+struct
+[raypayload]
+Payload
+{
+  int a : read(closesthit, miss) : write(anyhit);
+  dx::HitObject hit;
+};
+
+struct
+[raypayload]
+PayloadLV
+{
+  int a : read(closesthit, miss) : write(anyhit);
+  vector b : read(closesthit, miss) : write(anyhit);
+};
+
+[shader("raygeneration")]
+void RayGen()
+{
+  // expected-error@+3{{payload parameter 'payload_in_rg' must be a user-defined type composed of only numeric types}}
+  // expected-error@+2{{object 'dx::HitObject' is not allowed in payload parameters}}
+  // expected-note@8{{'dx::HitObject' field declared here}}
+  Payload payload_in_rg;
+  dx::HitObject::Invoke( dx::HitObject(), payload_in_rg );
+
+  // expected-error@+1{{vectors of over 4 elements in payload parameters are not supported}}
+  PayloadLV payload_with_lv;
+  dx::HitObject::Invoke( dx::HitObject(), payload_with_lv );
+}
\ No newline at end of file
diff --git a/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl b/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
index 667e1f4579..4bddf37acd 100644
--- a/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/semantics/ExtendedCommandInformation/WrongShaderModel.hlsl
@@ -1,7 +1,5 @@
 // TODO: use -verify instead of FileCheck after fix
 // https://github.com/microsoft/DirectXShaderCompiler/issues/5768
-// -select-validator internal used to avoid downlevel validator testing
-// incompatibility with shader model 6.7.
-// RUN: not %dxc -E main -T vs_6_7 -select-validator internal %s 2>&1 | FileCheck %s --check-prefix=SM67
+// RUN: not %dxc -E main -T vs_6_7 %s 2>&1 | FileCheck %s --check-prefix=SM67
 // SM67:invalid semantic 'SV_StartVertexLocation' for vs 6.7
 // SM67:invalid semantic 'SV_StartInstanceLocation' for vs 6.7
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl
new file mode 100644
index 0000000000..3a4457bd5f
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-hs.hlsl
@@ -0,0 +1,32 @@
+// RUN: %dxc -T hs_6_9 -verify %s
+
+struct HsConstantData {
+  float Edges[3] : SV_TessFactor;
+  dx::HitObject hit;
+};
+
+struct LongVec {
+  float4 f;
+  dx::HitObject hit;
+};
+
+HsConstantData
+PatchConstantFunction(
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function return type}}
+  // expected-note@5{{'dx::HitObject' field declared here}}
+  dx::HitObject hit : V,
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function parameters}}
+  LongVec lv : L)
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in patch constant function parameters}}
+  // expected-note@10{{'dx::HitObject' field declared here}}
+{
+  HsConstantData empty;
+  return empty;
+}
+
+[domain("tri")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(32)]
+[patchconstantfunc("PatchConstantFunction")]
+void main() {
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
new file mode 100644
index 0000000000..c852d17a1a
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-struct.hlsl
@@ -0,0 +1,344 @@
+// RUN: %dxc -T lib_6_9 -DTYPE=HitStruct -verify %s
+// RUN: %dxc -T lib_6_9 -DTYPE=HitStructSub -verify %s
+
+
+#define PASTE_(x,y) x##y
+#define PASTE(x,y) PASTE_(x,y)
+
+#ifndef TYPE
+#define TYPE HitTpl
+#endif
+
+// Add tests for base types and instantiated template classes with HitObjects
+
+struct HitStruct {
+  float4 f;
+  dx::HitObject hit;
+};
+
+struct HitStructSub : HitStruct {
+  int3 is;
+};
+
+template
+struct HitTpl {
+  float4 f;
+  T val;
+};
+
+TYPE global_type;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+dx::HitObject global_hit;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+dx::HitObject global_hit_arr[10];
+// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+
+static TYPE static_gv;
+// expected-error@-1{{object 'dx::HitObject' is not allowed in global variables}}
+// expected-note@16{{'dx::HitObject' field declared here}}
+
+cbuffer BadBuffy {
+  dx::HitObject cb_hit;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject cb_hit_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+};
+
+tbuffer BadTuffy {
+  dx::HitObject tb_vec;
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  dx::HitObject tb_vec_arr[10];
+  // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}}
+  TYPE
tb_vec_rec; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + // expected-note@16{{'dx::HitObject' field declared here}} + TYPE tb_vec_rec_arr[10]; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + // expected-note@16{{'dx::HitObject' field declared here}} +}; + +StructuredBuffer struct_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}} +// expected-note@16{{'dx::HitObject' field declared here}} +RWStructuredBuffer rw_struct_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}} +// expected-note@16{{'dx::HitObject' field declared here}} +ConstantBuffer const_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}} +// expected-note@16{{'dx::HitObject' field declared here}} +TextureBuffer tex_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}} +// expected-note@16{{'dx::HitObject' field declared here}} + +ByteAddressBuffer bab; +RWByteAddressBuffer rw_bab; + +[Shader("raygeneration")] +void main() +{ + bab.Load(0); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}} + rw_bab.Load(0); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}} + TYPE val; + rw_bab.Store(0, val); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Store must be a single numeric type}} +} + +[shader("pixel")] +TYPE ps_main( +// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}} +// expected-note@16{{'dx::HitObject' field declared here}} + TYPE vec : V) : SV_Target { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + return vec; +} + +[shader("vertex")] +TYPE vs_main( +// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}} +// expected-note@16{{'dx::HitObject' field declared here}} + TYPE parm : P) : SV_Target { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + parm.f = 0; + return parm; +} + + +[shader("geometry")] +[maxvertexcount(3)] +void gs_point( + line TYPE e, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + inout PointStream OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}} + // expected-note@16{{'dx::HitObject' field declared here}} +{} + +[shader("geometry")] +[maxvertexcount(12)] +void gs_line( + line TYPE a, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + inout LineStream OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed 
in geometry streams}} + // expected-note@16{{'dx::HitObject' field declared here}} +{} + + +[shader("geometry")] +[maxvertexcount(12)] +void gs_tri( + triangle TYPE a, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + inout TriangleStream OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}} + // expected-note@16{{'dx::HitObject' field declared here}} +{} + +[shader("domain")] +[domain("tri")] +void ds_main( + OutputPatch TrianglePatch) + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@16{{'dx::HitObject' field declared here}} +{} + +void patch_const( + InputPatch inpatch, + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@16{{'dx::HitObject' field declared here}} + OutputPatch outpatch) + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@16{{'dx::HitObject' field declared here}} +{} + +[shader("hull")] +[domain("tri")] +[outputtopology("triangle_cw")] +[outputcontrolpoints(32)] +[patchconstantfunc("patch_const")] +void hs_main(InputPatch TrianglePatch) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} +// expected-note@16{{'dx::HitObject' field declared here}} + +RaytracingAccelerationStructure RTAS; + +struct [raypayload] DXRHitStruct { + float4 f : write(closesthit) : read(caller); + TYPE hit : write(closesthit) : read(caller); +}; + +struct [raypayload] DXRHitStructSub : DXRHitStruct { + int3 is : write(closesthit) : read(caller); +}; + +template +struct [raypayload] DXRHitTpl { + float4 f : write(closesthit) : read(caller); + T hit : write(closesthit) : read(caller); +}; + +#define RTTYPE PASTE(DXR,TYPE) + + +TYPE userFunc(TYPE arg) { + return arg; +} + +[shader("raygeneration")] +void raygen() { + RTTYPE p = (RTTYPE)0; + RayDesc ray = (RayDesc)0; + TraceRay(RTAS, RAY_FLAG_NONE, 0, 0, 1, 0, ray, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} + CallShader(0, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} + TYPE val; + TYPE res = userFunc(val); +} + +[shader("closesthit")] +void closesthit( + inout RTTYPE payload, + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + in RTTYPE attribs) { + // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + RayDesc ray; + TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} + CallShader(0, payload); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +[shader("anyhit")] +void 
AnyHit( + inout RTTYPE payload, + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + in RTTYPE attribs) + // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} +{ +} + +[shader("miss")] +void Miss( + inout RTTYPE payload){ + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + RayDesc ray; + TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} + CallShader(0, payload); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +[shader("intersection")] +void Intersection() { + float hitT = RayTCurrent(); + RTTYPE attr = (RTTYPE)0; + bool bReported = ReportHit(hitT, 0, attr); + // expected-error@-1{{object 'dx::HitObject' is not allowed in attributes}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +[shader("callable")] +void callable1( + inout RTTYPE p) { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@16{{'dx::HitObject' field declared here}} + // expected-error@-3{{callable parameter 'p' must be a user-defined type composed of only numeric types}} + CallShader(0, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +static groupshared TYPE gs_var; +// expected-error@-1{{object 'dx::HitObject' is not allowed in groupshared variables}} +// expected-note@16{{'dx::HitObject' field declared here}} + +[shader("amplification")] +[numthreads(1,1,1)] +void Amp() { + TYPE as_pld; + DispatchMesh(1,1,1,as_pld); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +struct NodeHitStruct { + uint3 grid : SV_DispatchGrid; + TYPE hit; +}; + +struct NodeHitStructSub : NodeHitStruct { + int3 is; +}; + +template +struct NodeHitTpl { + uint3 grid : SV_DispatchGrid; + T hit; +}; + +#define NTYPE PASTE(Node,TYPE) + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(8,1,1)] +[NodeMaxDispatchGrid(8, 1, 1)] +void broadcast( +// expected-error@-1{{Broadcasting node shader 'broadcast' with NodeMaxDispatchGrid attribute must declare an input record containing a field with SV_DispatchGrid semantic}} + DispatchNodeInputRecord input, + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@16{{'dx::HitObject' field declared here}} + NodeOutput output) + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@16{{'dx::HitObject' field declared here}} +{ + ThreadNodeOutputRecords touts; + // expected-error@-1{{object 
'dx::HitObject' is not allowed in node records}} + // expected-note@16{{'dx::HitObject' field declared here}} + GroupNodeOutputRecords gouts; + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@16{{'dx::HitObject' field declared here}} +} + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(8,1,1)] +void coalesce(GroupNodeInputRecords input) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} +// expected-note@16{{'dx::HitObject' field declared here}} + +[Shader("node")] +[NodeLaunch("thread")] +void threader(ThreadNodeInputRecord input) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} +// expected-note@16{{'dx::HitObject' field declared here}} diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl new file mode 100644 index 0000000000..c2303a8608 --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-hitobject-decls-templated.hlsl @@ -0,0 +1,340 @@ +// RUN: %dxc -T lib_6_9 -verify %s + + +#define PASTE_(x,y) x##y +#define PASTE(x,y) PASTE_(x,y) + +#define TYPE HitTpl + +// Add tests for base types and instantiated template classes with HitObjects + +struct HitStruct { + float4 f; + dx::HitObject hit; +}; + +struct HitStructSub : HitStruct { + int3 is; +}; + +template +struct HitTpl { + float4 f; + T val; +}; + +RaytracingAccelerationStructure RTAS; + +struct [raypayload] DXRHitStruct { + float4 f : write(closesthit) : read(caller); + TYPE hit : write(closesthit) : read(caller); +}; + +struct [raypayload] DXRHitStructSub : DXRHitStruct { + int3 is : write(closesthit) : read(caller); +}; + +template +struct [raypayload] DXRHitTpl { + float4 f : write(closesthit) : read(caller); + T hit : write(closesthit) : read(caller); +}; + +struct NodeHitStruct { + uint3 grid : SV_DispatchGrid; + TYPE hit; +}; + +struct NodeHitStructSub : NodeHitStruct { + int3 is; +}; + +template +struct NodeHitTpl { + uint3 grid : SV_DispatchGrid; + T hit; +}; + +TYPE global_type; +// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} +// expected-note@23{{'dx::HitObject' field declared here}} +dx::HitObject global_hit; +// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} +dx::HitObject global_hit_arr[10]; +// expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + +static TYPE static_gv; +// expected-error@-1{{object 'dx::HitObject' is not allowed in global variables}} +// expected-note@23{{'dx::HitObject' field declared here}} + +cbuffer BadBuffy { + dx::HitObject cb_hit; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + dx::HitObject cb_hit_arr[10]; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} +}; + +tbuffer BadTuffy { + dx::HitObject tb_vec; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + dx::HitObject tb_vec_arr[10]; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + TYPE tb_vec_rec; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + // expected-note@23{{'dx::HitObject' field declared here}} + TYPE tb_vec_rec_arr[10]; + // expected-error@-1{{object 'dx::HitObject' is not allowed in cbuffers or tbuffers}} + // expected-note@23{{'dx::HitObject' field declared here}} +}; + 
+StructuredBuffer struct_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}} +// expected-note@23{{'dx::HitObject' field declared here}} +RWStructuredBuffer rw_struct_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in structured buffers}} +// expected-note@23{{'dx::HitObject' field declared here}} +ConstantBuffer const_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}} +// expected-note@23{{'dx::HitObject' field declared here}} +TextureBuffer tex_buf; +// expected-error@-1{{object 'dx::HitObject' is not allowed in ConstantBuffers or TextureBuffers}} +// expected-note@23{{'dx::HitObject' field declared here}} + +ByteAddressBuffer bab; +RWByteAddressBuffer rw_bab; + +[Shader("raygeneration")] +void main() +{ + bab.Load(0); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}} + rw_bab.Load(0); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Load must be a single numeric type}} + TYPE val; + rw_bab.Store(0, val); + // expected-error@-1{{object 'dx::HitObject' is not allowed in builtin template parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + // expected-error@-3{{Explicit template arguments on intrinsic Store must be a single numeric type}} +} + +[shader("pixel")] +TYPE ps_main( +// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}} +// expected-note@23{{'dx::HitObject' field declared here}} + TYPE vec : V) : SV_Target { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + return vec; +} + +[shader("vertex")] +TYPE vs_main( +// expected-error@-1{{object 'dx::HitObject' is not allowed in entry function return type}} +// expected-note@23{{'dx::HitObject' field declared here}} + TYPE parm : P) : SV_Target { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + parm.f = 0; + return parm; +} + + +[shader("geometry")] +[maxvertexcount(3)] +void gs_point( + line TYPE e, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + inout PointStream OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}} + // expected-note@23{{'dx::HitObject' field declared here}} +{} + +[shader("geometry")] +[maxvertexcount(12)] +void gs_line( + line TYPE a, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + inout LineStream OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}} + // expected-note@23{{'dx::HitObject' field declared here}} +{} + + +[shader("geometry")] +[maxvertexcount(12)] +void gs_line( + line TYPE a, + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@23{{'dx::HitObject' field declared here}} + inout TriangleStream 
OutputStream0) + // expected-error@-1{{object 'dx::HitObject' is not allowed in geometry streams}} + // expected-note@23{{'dx::HitObject' field declared here}} +{} + +[shader("domain")] +[domain("tri")] +void ds_main( + OutputPatch TrianglePatch) + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@23{{'dx::HitObject' field declared here}} +{} + +void patch_const( + InputPatch inpatch, + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@23{{'dx::HitObject' field declared here}} + OutputPatch outpatch) + // expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} + // expected-note@23{{'dx::HitObject' field declared here}} +{} + +[shader("hull")] +[domain("tri")] +[outputtopology("triangle_cw")] +[outputcontrolpoints(32)] +[patchconstantfunc("patch_const")] +void hs_main(InputPatch TrianglePatch) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in tessellation patches}} +// expected-note@23{{'dx::HitObject' field declared here}} + +#define RTTYPE PASTE(DXR,TYPE) + +TYPE userFunc(TYPE arg) { + return arg; +} + +[shader("raygeneration")] +void raygen() { + RTTYPE p = (RTTYPE)0; + RayDesc ray = (RayDesc)0; + TraceRay(RTAS, RAY_FLAG_NONE, 0, 0, 1, 0, ray, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} + CallShader(0, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} + TYPE val; + TYPE res = userFunc(val); +} + +[shader("closesthit")] +void closesthit( + inout RTTYPE payload, + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} + in RTTYPE attribs) { + // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} + RayDesc ray; + TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} + CallShader(0, payload); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} +} + +[shader("anyhit")] +void AnyHit( + inout RTTYPE payload, + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} + in RTTYPE attribs) + // expected-error@-1{{attributes parameter 'attribs' must be a user-defined type composed of only numeric types}} + // expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} +{ +} + +[shader("miss")] +void Miss( + inout RTTYPE payload){ + // expected-error@-1{{payload parameter 'payload' must be a user-defined type composed of only numeric types}} + // 
expected-error@-2{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} + RayDesc ray; + TraceRay( RTAS, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload ); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} + CallShader(0, payload); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} +} + +[shader("intersection")] +void Intersection() { + float hitT = RayTCurrent(); + RTTYPE attr = (RTTYPE)0; + bool bReported = ReportHit(hitT, 0, attr); + // expected-error@-1{{object 'dx::HitObject' is not allowed in attributes}} + // expected-note@40{{'dx::HitObject' field declared here}} +} + +[shader("callable")] +void callable1( + inout RTTYPE p) { + // expected-error@-1{{object 'dx::HitObject' is not allowed in entry function parameters}} + // expected-note@40{{'dx::HitObject' field declared here}} + // expected-error@-3{{callable parameter 'p' must be a user-defined type composed of only numeric types}} + CallShader(0, p); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@40{{'dx::HitObject' field declared here}} +} + +static groupshared TYPE gs_var; +// expected-error@-1{{object 'dx::HitObject' is not allowed in groupshared variables}} +// expected-note@23{{'dx::HitObject' field declared here}} + +[shader("amplification")] +[numthreads(1,1,1)] +void Amp() { + TYPE as_pld; + DispatchMesh(1,1,1,as_pld); + // expected-error@-1{{object 'dx::HitObject' is not allowed in user-defined struct parameter}} + // expected-note@23{{'dx::HitObject' field declared here}} +} + +#define NTYPE PASTE(Node,TYPE) + +[Shader("node")] +[NodeLaunch("broadcasting")] +[NumThreads(8,1,1)] +[NodeMaxDispatchGrid(8, 1, 1)] +void broadcast( +// expected-error@-1{{Broadcasting node shader 'broadcast' with NodeMaxDispatchGrid attribute must declare an input record containing a field with SV_DispatchGrid semantic}} + DispatchNodeInputRecord input, + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@55{{'dx::HitObject' field declared here}} + NodeOutput output) + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@23{{'dx::HitObject' field declared here}} +{ + ThreadNodeOutputRecords touts; + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@23{{'dx::HitObject' field declared here}} + GroupNodeOutputRecords gouts; + // expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} + // expected-note@23{{'dx::HitObject' field declared here}} +} + +[Shader("node")] +[NodeLaunch("coalescing")] +[NumThreads(8,1,1)] +void coalesce(GroupNodeInputRecords input) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} +// expected-note@23{{'dx::HitObject' field declared here}} + +[Shader("node")] +[NodeLaunch("thread")] +void threader(ThreadNodeInputRecord input) {} +// expected-error@-1{{object 'dx::HitObject' is not allowed in node records}} +// expected-note@23{{'dx::HitObject' field declared here}} diff --git a/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl b/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl index 0604feeaec..96c5d4b5f4 100644 --- 
a/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl +++ b/tools/clang/test/SemaHLSL/hlsl/types/invalid-longvec-decls.hlsl @@ -146,7 +146,7 @@ void Miss(inout RTTYPE payload){ // expected-error{{vectors of over 4 elements i void Intersection() { float hitT = RayTCurrent(); RTTYPE attr = (RTTYPE)0; - bool bReported = ReportHit(hitT, 0, attr); // expected-error{{vectors of over 4 elements in user-defined struct parameter are not supported}} + bool bReported = ReportHit(hitT, 0, attr); // expected-error{{vectors of over 4 elements in attributes are not supported}} } [shader("callable")] diff --git a/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl b/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl index 40b820a1b4..de523d51d1 100644 --- a/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl +++ b/tools/clang/test/SemaHLSL/hlsl/workgraph/invalid_node_record_type.hlsl @@ -76,7 +76,7 @@ void node07(RWThreadNodeInputRecord input) // expected-error {{'f2x2' (aka [Shader("node")] [NodeLaunch("thread")] -void node08(ThreadNodeInputRecord input) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node08(ThreadNodeInputRecord input) // expected-error {{object 'SamplerState' is not allowed in node records}} { } [Shader("node")] @@ -86,17 +86,17 @@ void node09(ThreadNodeInputRecord input) // expected-error {{'BAD [Shader("node")] [NodeLaunch("thread")] -void node10(RWThreadNodeInputRecord input) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node10(RWThreadNodeInputRecord input) // expected-error {{object 'SamplerState' is not allowed in node records}} { } [Shader("node")] [NodeLaunch("thread")] -void node11(NodeOutput input) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node11(NodeOutput input) // expected-error {{object 'SamplerState' is not allowed in node records}} { } [Shader("node")] [NodeLaunch("thread")] -void node12(NodeOutputArray output) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node12(NodeOutputArray output) // expected-error {{object 'SamplerState' is not allowed in node records}} { } [Shader("node")] @@ -129,7 +129,7 @@ void node16() ThreadNodeOutputRecords outrec2; // expected-error {{'f2x2' (aka 'matrix') is not valid as a node record type - struct/class required}} - GroupNodeOutputRecords outrec3; // expected-error {{object 'SamplerState' may not appear in a node record}} + GroupNodeOutputRecords outrec3; // expected-error {{object 'SamplerState' is not allowed in node records}} ThreadNodeOutputRecords outrec4; // expected-error {{'SamplerState' is not valid as a node record type - struct/class required}} } @@ -151,10 +151,10 @@ void node17(ThreadNodeInputRecord > input) [Shader("node")] [NodeLaunch("thread")] -void node18(ThreadNodeInputRecord > input) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node18(ThreadNodeInputRecord > input) // expected-error {{object 'SamplerState' is not allowed in node records}} { } [Shader("node")] [NodeLaunch("thread")] -void node19(RWThreadNodeInputRecord input) // expected-error {{object 'SamplerState' may not appear in a node record}} +void node19(RWThreadNodeInputRecord input) // expected-error {{object 'SamplerState' is not allowed in node records}} { } diff --git a/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl b/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl index e41c6a2f4f..8dfc927e11 
100644 --- a/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl +++ b/tools/clang/test/SemaHLSL/raytracing-entry-diags.hlsl @@ -181,3 +181,24 @@ void callable7(inout MyPayload payload, float F) {} [shader("callable")] float callable8(inout MyPayload payload) {} // expected-error{{return type for 'callable' shaders must be void}} + +// expected-note@+1 6 {{forward declaration of 'Incomplete'}} +struct Incomplete; + +// expected-error@+3{{variable has incomplete type 'Incomplete'}} +// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}} +[shader("anyhit")] +void anyhit_incomplete( inout Incomplete A1, Incomplete A2) { } + +// expected-error@+3{{variable has incomplete type 'Incomplete'}} +// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}} +[shader("closesthit")] +void closesthit_incomplete( inout Incomplete payload, Incomplete attr ) {} + +// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}} +[shader("miss")] +void miss_incomplete( inout Incomplete payload) { } + +// expected-error@+2{{variable has incomplete type '__restrict Incomplete'}} +[shader("callable")] +void callable_incomplete(inout Incomplete payload) {} diff --git a/tools/clang/test/SemaHLSL/raytracings.hlsl b/tools/clang/test/SemaHLSL/raytracings.hlsl index d3bc01fcd6..429037f22b 100644 --- a/tools/clang/test/SemaHLSL/raytracings.hlsl +++ b/tools/clang/test/SemaHLSL/raytracings.hlsl @@ -12,14 +12,14 @@ void run() { RAY_FLAG_CULL_OPAQUE + RAY_FLAG_CULL_NON_OPAQUE; - rayFlags += RAY_FLAG_INVALID; /* expected-note@? {{'RAY_FLAG_NONE' declared here}} expected-error {{use of undeclared identifier 'RAY_FLAG_INVALID'; did you mean 'RAY_FLAG_NONE'?}} */ + rayFlags += RAY_FLAG_INVALID; /* expected-error {{use of undeclared identifier 'RAY_FLAG_INVALID'; did you mean 'RAY_FLAG_NONE'?}} */ int intFlag = RAY_FLAG_CULL_OPAQUE; int hitKindFlag = HIT_KIND_TRIANGLE_FRONT_FACE + HIT_KIND_TRIANGLE_BACK_FACE; - hitKindFlag += HIT_KIND_INVALID; /* expected-note@? {{'HIT_KIND_NONE' declared here}} expected-error {{use of undeclared identifier 'HIT_KIND_INVALID'; did you mean 'HIT_KIND_NONE'?}} */ + hitKindFlag += HIT_KIND_INVALID; /* expected-error {{use of undeclared identifier 'HIT_KIND_INVALID'; did you mean 'HIT_KIND_NONE'?}} */ BuiltInTriangleIntersectionAttributes attr; diff --git a/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl b/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl new file mode 100644 index 0000000000..31d4898efe --- /dev/null +++ b/tools/clang/test/SemaHLSL/sizeof-requires-complete-type.hlsl @@ -0,0 +1,27 @@ +// RUN: %dxc -T lib_6_3 -verify %s + +struct Complete {}; + +struct Incomplete; // expected-note{{forward declaration of 'Incomplete'}} +template struct CompleteTemplate {}; + +void fn() { + uint s; + // Complete types are easy. They are complete before we get to the expression. + s = sizeof(Complete); // This works! + + // A type may be incomplete for several reasons. + + // It may be incomplete because there is only a forward declaration, which + // should produce an error since we can't materialize a definition. + s = sizeof(Incomplete); // expected-error{{invalid application of 'sizeof' to an incomplete type 'Incomplete'}} + + // It may be incomplete because it is an un-instantiated template, which + // should work because we can just instantiate it. + s = sizeof(CompleteTemplate); // This works! 
+ + // It may be incomplete because it is a lazy-initialized type from HLSL, + // which can be completed, and then will report a non-numeric type error. + // expected-error@+1{{invalid application of 'sizeof' to non-numeric type 'Buffer'}} + s = sizeof(Buffer); +} diff --git a/tools/clang/test/SemaHLSL/template-checks.hlsl b/tools/clang/test/SemaHLSL/template-checks.hlsl index d0d736fc1f..751e89b652 100644 --- a/tools/clang/test/SemaHLSL/template-checks.hlsl +++ b/tools/clang/test/SemaHLSL/template-checks.hlsl @@ -1,8 +1,8 @@ // RUN: %dxc -Tlib_6_3 -verify %s Texture2D t_float4; -Texture2D t_obj_sampler; /* expected-error {{'SamplerState' is an object and cannot be used as a type parameter}} fxc-error {{X3124: object element type cannot be an object type}} */ -Texture2D > t_obj_tex; /* expected-error {{'Texture2D' is an object and cannot be used as a type parameter}} fxc-error {{X3124: object element type cannot be an object type}} */ +Texture2D t_obj_sampler; /* expected-error {{object 'SamplerState' is not allowed in builtin template parameters}} fxc-error {{X3124: object element type cannot be an object type}} */ +Texture2D > t_obj_tex; /* expected-error {{object 'Texture2D' is not allowed in builtin template parameters}} fxc-error {{X3124: object element type cannot be an object type}} */ matrix m_obj_sampler; /* expected-error {{'SamplerState' cannot be used as a type parameter where a scalar is required}} fxc-error {{X3123: matrix element type must be a scalar type}} */ matrix m_bool; @@ -15,7 +15,7 @@ matrix m_bool; RWBuffer rwb_struct; /* expected-error {{elements of typed buffers and textures must fit in four 32-bit quantities}} fxc-error {{X3037: elements of typed buffers and textures must fit in four 32-bit quantities}} */ -RWBuffer rwb_struct_objs; /* expected-error {{'SamplerState' is an object and cannot be used as a type parameter}} */ +RWBuffer rwb_struct_objs; /* expected-error {{object 'SamplerState' is not allowed in builtin template parameters}} */ void vain() { // Nothing to do here. 
diff --git a/tools/clang/test/SemaHLSL/template-udt-load.hlsl b/tools/clang/test/SemaHLSL/template-udt-load.hlsl index 591f27b384..dd7cf8bd16 100644 --- a/tools/clang/test/SemaHLSL/template-udt-load.hlsl +++ b/tools/clang/test/SemaHLSL/template-udt-load.hlsl @@ -4,10 +4,51 @@ ByteAddressBuffer In; RWBuffer Out; +template +struct Foo { + // expected-note@+1{{'RWBuffer' field declared here}} + T Member; +}; + +template +struct MyTemplate { + T GetValue(ByteAddressBuffer srv, uint offset) { + // expected-error@+2{{Explicit template arguments on intrinsic Load must be a single numeric type}} + // expected-error@+1{{object 'RWBuffer' is not allowed in builtin template parameters}} + return srv.Load(offset); + } +}; +template +T GetValue(uint offset) { + MyTemplate myTemplate; + // expected-error@+2{{scalar, vector, or matrix expected}} + // expected-note@+1{{in instantiation of member function 'MyTemplate >::GetValue' requested here}} + return myTemplate.GetValue(In, offset) + + // expected-error@+2{{Explicit template arguments on intrinsic Load must be a single numeric type}} + // expected-error@+1{{object 'RWBuffer' is not allowed in builtin template parameters}} + In.Load >(offset + 4).Member; +} + +// expected-note@+1{{forward declaration of 'Incomplete'}} +struct Incomplete; + [shader("compute")] [numthreads(1,1,1)] void main() { - RWBuffer FB = In.Load >(0); // expected-error {{Explicit template arguments on intrinsic Load must be a single numeric type}} + RWBuffer FB = In.Load >(0); + // expected-error@-1{{Explicit template arguments on intrinsic Load must be a single numeric type}} + // expected-error@-2{{object 'RWBuffer' is not allowed in builtin template parameters}} + Out[0] = FB[0]; + + // Ok: + Out[4] = GetValue(4); + + // expected-note@?{{'Load' declared here}} + // expected-error@+1{{calling 'Load' with incomplete return type 'Incomplete'}} + Out[8] = In.Load(8); + + // expected-note@+1 2 {{in instantiation of function template specialization 'GetValue >' requested here}} + RWBuffer FB2 = GetValue >(16); } diff --git a/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl b/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl new file mode 100644 index 0000000000..233ce103ce --- /dev/null +++ b/tools/clang/test/SemaHLSL/using-namespace-dx-errors.hlsl @@ -0,0 +1,42 @@ +// RUN: %dxc -T lib_6_9 %s -verify + +RaytracingAccelerationStructure Scene : register(t0, space0); + +struct[raypayload] RayPayload { + float4 color : write(caller) : read(closesthit); +}; + +[shader("raygeneration")] void MyRaygenShader() { + // Set the ray's extents. 
+ RayDesc ray; + ray.Origin = float3(0, 0, 1); + ray.Direction = float3(1, 0, 0); + ray.TMin = 0.001; + ray.TMax = 10000.0; + + RayPayload payload = {float4(0, 0, 0, 0)}; + + { + using namespace dx; + HitObject hit = + HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, + ray, payload); + + int sortKey = 1; + MaybeReorderThread(sortKey, 1); + } + + { + int sortKey = 1; + MaybeReorderThread(sortKey, 1); // expected-error{{use of undeclared identifier 'MaybeReorderThread'; did you mean 'MaybeReorderThread'?}} + } + + int sortKey = 1; + MaybeReorderThread(sortKey, 1); // expected-error{{use of undeclared identifier 'MaybeReorderThread'; did you mean 'MaybeReorderThread'?}} + + HitObject hit = // expected-error{{unknown type name 'HitObject'}} + HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, + ray, payload); + + HitObject::Invoke(hit, payload); // expected-error{{use of undeclared identifier 'HitObject'}} +} diff --git a/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl b/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl new file mode 100644 index 0000000000..093e86b2fa --- /dev/null +++ b/tools/clang/test/SemaHLSL/using-namespace-dx.hlsl @@ -0,0 +1,56 @@ +// RUN: %dxc -T lib_6_9 -ast-dump-implicit %s | FileCheck %s + +RaytracingAccelerationStructure Scene : register(t0, space0); + +struct[raypayload] RayPayload { + float4 color : write(caller) : read(closesthit); +}; + +namespace MyStuff { + using namespace dx; + void MaybeReorderThread(int2 V); +} + +void MyStuff::MaybeReorderThread(int2 V) { + MaybeReorderThread(V.x, V.y); +} + +[shader("raygeneration")] void MyRaygenShader() { + // Set the ray's extents. + RayDesc ray; + ray.Origin = float3(0, 0, 1); + ray.Direction = float3(1, 0, 0); + ray.TMin = 0.001; + ray.TMax = 10000.0; + + RayPayload payload = {float4(0, 0, 0, 0)}; + + using namespace dx; + HitObject hit = + HitObject::TraceRay(Scene, RAY_FLAG_NONE, ~0, 0, 1, 0, + ray, payload); + + int sortKey = 1; + MaybeReorderThread(sortKey, 1); + + HitObject::Invoke(hit, payload); + + MyStuff::MaybeReorderThread(int2(sortKey, 1)); +} + +// Find the DeclRefExpr for the call to MaybeReorderThread: + +// CHECK: FunctionDecl [[MyDeclAddr:0x[0-9a-fA-F]+]] parent {{.*}} used MaybeReorderThread 'void (int2)' +// CHECK: DeclRefExpr {{.*}} 'void (unsigned int, unsigned int)' lvalue Function [[DeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (unsigned int, unsigned int)' + +// CHECK: FunctionDecl [[DeclAddr]] <> implicit used MaybeReorderThread 'void (unsigned int, unsigned int)' extern +// CHECK-NEXT: ParmVarDecl {{.*}} CoherenceHint 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} NumCoherenceHintBitsFromLSB 'unsigned int' +// CHECK-NEXT: HLSLIntrinsicAttr {{.*}} Implicit "op" "" 359 +// CHECK-NEXT: AvailabilityAttr {{.*}} Implicit 6.9 0 0 "" + +// CHECK-LABEL: MyRaygenShader + +// CHECK: DeclRefExpr {{.*}} 'void (unsigned int, unsigned int)' lvalue Function [[DeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (unsigned int, unsigned int)' +// CHECK: DeclRefExpr {{.*}} 'void (int2)' lvalue Function [[MyDeclAddr:0x[0-9a-fA-F]+]] 'MaybeReorderThread' 'void (int2)' + diff --git a/tools/clang/tools/dxcompiler/CMakeLists.txt b/tools/clang/tools/dxcompiler/CMakeLists.txt index c69e276194..26bf0e5d98 100644 --- a/tools/clang/tools/dxcompiler/CMakeLists.txt +++ b/tools/clang/tools/dxcompiler/CMakeLists.txt @@ -57,7 +57,6 @@ set(SOURCES DXCompiler.rc DXCompiler.def dxcfilesystem.cpp - dxillib.cpp dxcutil.cpp dxcdisassembler.cpp dxcpdbutils.cpp @@ -75,7 +74,6 @@ set(SOURCES dxcutil.cpp 
dxcdisassembler.cpp dxcpdbutils.cpp - dxillib.cpp dxcvalidator.cpp dxclinker.cpp dxcshadersourceinfo.cpp diff --git a/tools/clang/tools/dxcompiler/DXCompiler.cpp b/tools/clang/tools/dxcompiler/DXCompiler.cpp index c548441449..c7ffcbffa1 100644 --- a/tools/clang/tools/dxcompiler/DXCompiler.cpp +++ b/tools/clang/tools/dxcompiler/DXCompiler.cpp @@ -19,7 +19,6 @@ #ifdef LLVM_ON_WIN32 #include "dxcetw.h" #endif -#include "dxillib.h" namespace hlsl { HRESULT SetupRegistryPassForHLSL(); @@ -65,7 +64,6 @@ static HRESULT InitMaybeFail() throw() { fsSetup = true; IFC(hlsl::SetupRegistryPassForHLSL()); IFC(hlsl::SetupRegistryPassForPIX()); - IFC(DxilLibInitialize()); if (hlsl::options::initHlslOptTable()) { hr = E_FAIL; goto Cleanup; @@ -110,12 +108,6 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD Reason, LPVOID reserved) { ::hlsl::options::cleanupHlslOptTable(); ::llvm::sys::fs::CleanupPerThreadFileSystem(); ::llvm::llvm_shutdown(); - if (reserved == - NULL) { // FreeLibrary has been called or the DLL load failed - DxilLibCleanup(DxilLibCleanUpType::UnloadLibrary); - } else { // Process termination. We should not call FreeLibrary() - DxilLibCleanup(DxilLibCleanUpType::ProcessTermination); - } DxcClearThreadMalloc(); DxcCleanupThreadMalloc(); DxcEtw_DXCompilerShutdown_Stop(S_OK); diff --git a/tools/clang/tools/dxcompiler/dxcapi.cpp b/tools/clang/tools/dxcompiler/dxcapi.cpp index a6a877cba4..d4e85bc35c 100644 --- a/tools/clang/tools/dxcompiler/dxcapi.cpp +++ b/tools/clang/tools/dxcompiler/dxcapi.cpp @@ -25,7 +25,6 @@ #include "dxcetw.h" #endif #include "dxc/DxilContainer/DxcContainerBuilder.h" -#include "dxillib.h" #include HRESULT CreateDxcCompiler(REFIID riid, _Out_ LPVOID *ppv); @@ -59,20 +58,11 @@ HRESULT CreateDxcContainerReflection(REFIID riid, _Out_ LPVOID *ppv) { HRESULT CreateDxcContainerBuilder(REFIID riid, _Out_ LPVOID *ppv) { // Call dxil.dll's containerbuilder *ppv = nullptr; - const char *warning; - HRESULT hr = DxilLibCreateInstance(CLSID_DxcContainerBuilder, - (IDxcContainerBuilder **)ppv); - if (FAILED(hr)) { - warning = "Unable to create container builder from dxil.dll. 
Resulting " - "container will not be signed.\n"; - } else { - return hr; - } CComPtr Result = DxcContainerBuilder::Alloc(DxcGetThreadMallocNoRef()); IFROOM(Result.p); - Result->Init(warning); + Result->Init(); return Result->QueryInterface(riid, ppv); } @@ -87,11 +77,7 @@ static HRESULT ThreadMallocDxcCreateInstance(REFCLSID rclsid, REFIID riid, } else if (IsEqualCLSID(rclsid, CLSID_DxcUtils)) { hr = CreateDxcUtils(riid, ppv); } else if (IsEqualCLSID(rclsid, CLSID_DxcValidator)) { - if (DxilLibIsEnabled()) { - hr = DxilLibCreateInstance(rclsid, riid, (IUnknown **)ppv); - } else { - hr = CreateDxcValidator(riid, ppv); - } + hr = CreateDxcValidator(riid, ppv); } else if (IsEqualCLSID(rclsid, CLSID_DxcAssembler)) { hr = CreateDxcAssembler(riid, ppv); } else if (IsEqualCLSID(rclsid, CLSID_DxcOptimizer)) { diff --git a/tools/clang/tools/dxcompiler/dxcassembler.cpp b/tools/clang/tools/dxcompiler/dxcassembler.cpp index 0ff2abe26c..6622e93cbc 100644 --- a/tools/clang/tools/dxcompiler/dxcassembler.cpp +++ b/tools/clang/tools/dxcompiler/dxcassembler.cpp @@ -19,7 +19,6 @@ #include "dxc/Support/dxcfilesystem.h" #include "dxc/Support/microcom.h" #include "dxcutil.h" -#include "dxillib.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/LLVMContext.h" diff --git a/tools/clang/tools/dxcompiler/dxclinker.cpp b/tools/clang/tools/dxcompiler/dxclinker.cpp index 82c9b8e96b..f5427ccc08 100644 --- a/tools/clang/tools/dxcompiler/dxclinker.cpp +++ b/tools/clang/tools/dxcompiler/dxclinker.cpp @@ -18,7 +18,6 @@ #include "dxc/Support/dxcapi.impl.h" #include "dxc/Support/microcom.h" #include "dxc/dxcapi.h" -#include "dxillib.h" #include "llvm/ADT/SmallVector.h" #include diff --git a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp index ebeee380ef..84b568df9c 100644 --- a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp +++ b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp @@ -56,7 +56,6 @@ #include "dxcompileradapter.h" #include "dxcshadersourceinfo.h" #include "dxcversion.inc" -#include "dxillib.h" #include #include @@ -850,11 +849,9 @@ class DxcCompiler : public IDxcCompiler3, compiler.getCodeGenOpts().HLSLValidatorMajorVer = opts.ValVerMajor; compiler.getCodeGenOpts().HLSLValidatorMinorVer = opts.ValVerMinor; } else { - // Version from dxil.dll, or internal validator if unavailable dxcutil::GetValidatorVersion( &compiler.getCodeGenOpts().HLSLValidatorMajorVer, - &compiler.getCodeGenOpts().HLSLValidatorMinorVer, - opts.SelectValidator); + &compiler.getCodeGenOpts().HLSLValidatorMinorVer); } // Root signature-only container validation is only supported on 1.5 and @@ -934,7 +931,7 @@ class DxcCompiler : public IDxcCompiler3, CComPtr pValErrors; // Validation failure communicated through diagnostic error dxcutil::ValidateRootSignatureInContainer( - pOutputBlob, &compiler.getDiagnostics(), opts.SelectValidator); + pOutputBlob, &compiler.getDiagnostics()); } } } else if (opts.VerifyDiagnostics) { @@ -1054,8 +1051,7 @@ class DxcCompiler : public IDxcCompiler3, std::move(serializeModule), pOutputBlob, m_pMalloc, SerializeFlags, pOutputStream, 0, opts.GetPDBName(), &compiler.getDiagnostics(), &ShaderHashContent, pReflectionStream, - pRootSigStream, pRootSignatureBlob, pPrivateBlob, - opts.SelectValidator); + pRootSigStream, pRootSignatureBlob, pPrivateBlob); inputs.pVersionInfo = static_cast(this); @@ -1108,8 +1104,7 @@ class DxcCompiler : public IDxcCompiler3, CComPtr pValErrors; // Validation failure communicated through diagnostic error 
dxcutil::ValidateRootSignatureInContainer( - pRootSignature, &compiler.getDiagnostics(), - opts.SelectValidator); + pRootSignature, &compiler.getDiagnostics()); } IFT(pResult->SetOutputObject(DXC_OUT_ROOT_SIGNATURE, pRootSignature)); @@ -1324,13 +1319,6 @@ class DxcCompiler : public IDxcCompiler3, CComPtr pResult; hr = e.hr; std::string msg("Internal Compiler error: "); - switch (hr) { - case DXC_E_VALIDATOR_MISSING: - msg = "Error: external validator selected, but DXIL.dll not found."; - break; - default: - break; - } msg += e.msg; if (SUCCEEDED(DxcResult::Create( e.hr, DXC_OUT_NONE, diff --git a/tools/clang/tools/dxcompiler/dxcutil.cpp b/tools/clang/tools/dxcompiler/dxcutil.cpp index d3a531d4c6..4e5c5c95e8 100644 --- a/tools/clang/tools/dxcompiler/dxcutil.cpp +++ b/tools/clang/tools/dxcompiler/dxcutil.cpp @@ -19,7 +19,6 @@ #include "dxc/Support/WinIncludes.h" #include "dxc/Support/dxcapi.impl.h" #include "dxc/dxcapi.h" -#include "dxillib.h" #include "clang/Basic/Diagnostic.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/DebugInfo.h" @@ -49,23 +48,9 @@ HRESULT RunInternalValidator(IDxcValidator *pValidator, namespace { // AssembleToContainer helper functions. -bool CreateValidator(CComPtr &pValidator, - hlsl::options::ValidatorSelection SelectValidator = - hlsl::options::ValidatorSelection::Auto) { - bool bInternal = - SelectValidator == hlsl::options::ValidatorSelection::Internal; - bool bExternal = - SelectValidator == hlsl::options::ValidatorSelection::External; - if (!bInternal && DxilLibIsEnabled()) - DxilLibCreateInstance(CLSID_DxcValidator, &pValidator); - - bool bInternalValidator = false; - if (pValidator == nullptr) { - IFTBOOL(!bExternal, DXC_E_VALIDATOR_MISSING); - IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator))); - bInternalValidator = true; - } - return bInternalValidator; +// Create an instance of the internal validator. +void CreateValidator(CComPtr &pValidator) { + IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator))); } } // namespace @@ -79,23 +64,20 @@ AssembleInputs::AssembleInputs( uint32_t ValidationFlags, llvm::StringRef DebugName, clang::DiagnosticsEngine *pDiag, hlsl::DxilShaderHash *pShaderHashOut, AbstractMemoryStream *pReflectionOut, AbstractMemoryStream *pRootSigOut, - CComPtr pRootSigBlob, CComPtr pPrivateBlob, - hlsl::options::ValidatorSelection SelectValidator) + CComPtr pRootSigBlob, CComPtr pPrivateBlob) : pM(std::move(pM)), pOutputContainerBlob(pOutputContainerBlob), pMalloc(pMalloc), SerializeFlags(SerializeFlags), ValidationFlags(ValidationFlags), pModuleBitcode(pModuleBitcode), DebugName(DebugName), pDiag(pDiag), pShaderHashOut(pShaderHashOut), pReflectionOut(pReflectionOut), pRootSigOut(pRootSigOut), - pRootSigBlob(pRootSigBlob), pPrivateBlob(pPrivateBlob), - SelectValidator(SelectValidator) {} + pRootSigBlob(pRootSigBlob), pPrivateBlob(pPrivateBlob) {} -void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor, - hlsl::options::ValidatorSelection SelectValidator) { +void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor) { if (pMajor == nullptr || pMinor == nullptr) return; CComPtr pValidator; - CreateValidator(pValidator, SelectValidator); + CreateValidator(pValidator); CComPtr pVersionInfo; if (SUCCEEDED(pValidator.QueryInterface(&pVersionInfo))) { @@ -167,76 +149,19 @@ HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs) { std::unique_ptr llvmModuleWithDebugInfo; CComPtr pValidator; - bool bInternalValidator = CreateValidator(pValidator, inputs.SelectValidator); - // Warning on internal Validator - -
CComPtr pValidator2; - if (!bInternalValidator) { - pValidator.QueryInterface(&pValidator2); - } + CreateValidator(pValidator); - if (bInternalValidator || pValidator2) { - // If using the internal validator or external validator supports - // IDxcValidator2, we'll use the modules directly. In this case, we'll want - // to make a clone to avoid SerializeDxilContainerForModule stripping all - // the debug info. The debug info will be stripped from the orginal module, - // but preserved in the cloned module. - if (llvm::getDebugMetadataVersionFromModule(*inputs.pM) != 0) { - llvmModuleWithDebugInfo.reset(llvm::CloneModule(inputs.pM.get())); - } - } - - // Verify validator version can validate this module - CComPtr pValidatorVersion; - IFT(pValidator->QueryInterface(&pValidatorVersion)); - UINT32 ValMajor, ValMinor; - IFT(pValidatorVersion->GetVersion(&ValMajor, &ValMinor)); - DxilModule &DM = inputs.pM.get()->GetOrCreateDxilModule(); - unsigned ReqValMajor, ReqValMinor; - DM.GetValidatorVersion(ReqValMajor, ReqValMinor); - if (DXIL::CompareVersions(ValMajor, ValMinor, ReqValMajor, ReqValMinor) < 0) { - // Module is expecting to be validated by a newer validator. - if (inputs.pDiag) { - unsigned diagID = inputs.pDiag->getCustomDiagID( - clang::DiagnosticsEngine::Level::Error, - "The module cannot be validated by the version of the validator " - "currently attached."); - inputs.pDiag->Report(diagID); - } - return E_FAIL; - } + if (llvm::getDebugMetadataVersionFromModule(*inputs.pM) != 0) + llvmModuleWithDebugInfo.reset(llvm::CloneModule(inputs.pM.get())); AssembleToContainer(inputs); CComPtr pValResult; - // Important: in-place edit is required so the blob is reused and thus - // dxil.dll can be released. + // In-place edit to avoid an extra copy inputs.ValidationFlags |= DxcValidatorFlags_InPlaceEdit; - if (bInternalValidator) { - IFT(RunInternalValidator(pValidator, llvmModuleWithDebugInfo.get(), - inputs.pOutputContainerBlob, - inputs.ValidationFlags, &pValResult)); - } else { - if (pValidator2 && llvmModuleWithDebugInfo) { - // If metadata was stripped, re-serialize the input module. 
- CComPtr pDebugModuleStream; - IFT(CreateMemoryStream(DxcGetThreadMallocNoRef(), &pDebugModuleStream)); - raw_stream_ostream outStream(pDebugModuleStream.p); - WriteBitcodeToFile(llvmModuleWithDebugInfo.get(), outStream, true); - outStream.flush(); - - DxcBuffer debugModule = {}; - debugModule.Ptr = pDebugModuleStream->GetPtr(); - debugModule.Size = pDebugModuleStream->GetPtrSize(); - - IFT(pValidator2->ValidateWithDebug(inputs.pOutputContainerBlob, - inputs.ValidationFlags, &debugModule, - &pValResult)); - } else { - IFT(pValidator->Validate(inputs.pOutputContainerBlob, - inputs.ValidationFlags, &pValResult)); - } - } + IFT(RunInternalValidator(pValidator, llvmModuleWithDebugInfo.get(), + inputs.pOutputContainerBlob, inputs.ValidationFlags, + &pValResult)); IFT(pValResult->GetStatus(&valHR)); if (inputs.pDiag) { if (FAILED(valHR)) { @@ -261,9 +186,8 @@ HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs) { return valHR; } -HRESULT ValidateRootSignatureInContainer( - IDxcBlob *pRootSigContainer, clang::DiagnosticsEngine *pDiag, - hlsl::options::ValidatorSelection SelectValidator) { +HRESULT ValidateRootSignatureInContainer(IDxcBlob *pRootSigContainer, + clang::DiagnosticsEngine *pDiag) { HRESULT valHR = S_OK; CComPtr pValidator; CComPtr pValResult; diff --git a/tools/clang/tools/dxcompiler/dxcutil.h b/tools/clang/tools/dxcompiler/dxcutil.h index 45b3d4dc1a..8612353561 100644 --- a/tools/clang/tools/dxcompiler/dxcutil.h +++ b/tools/clang/tools/dxcompiler/dxcutil.h @@ -54,9 +54,7 @@ struct AssembleInputs { hlsl::AbstractMemoryStream *pReflectionOut = nullptr, hlsl::AbstractMemoryStream *pRootSigOut = nullptr, CComPtr pRootSigBlob = nullptr, - CComPtr pPrivateBlob = nullptr, - hlsl::options::ValidatorSelection SelectValidator = - hlsl::options::ValidatorSelection::Auto); + CComPtr pPrivateBlob = nullptr); std::unique_ptr pM; CComPtr &pOutputContainerBlob; IDxcVersionInfo *pVersionInfo = nullptr; @@ -71,18 +69,13 @@ struct AssembleInputs { hlsl::AbstractMemoryStream *pRootSigOut = nullptr; CComPtr pRootSigBlob = nullptr; CComPtr pPrivateBlob = nullptr; - hlsl::options::ValidatorSelection SelectValidator = - hlsl::options::ValidatorSelection::Auto; }; HRESULT ValidateAndAssembleToContainer(AssembleInputs &inputs); -HRESULT ValidateRootSignatureInContainer( - IDxcBlob *pRootSigContainer, clang::DiagnosticsEngine *pDiag = nullptr, - hlsl::options::ValidatorSelection SelectValidator = - hlsl::options::ValidatorSelection::Auto); +HRESULT +ValidateRootSignatureInContainer(IDxcBlob *pRootSigContainer, + clang::DiagnosticsEngine *pDiag = nullptr); HRESULT SetRootSignature(hlsl::DxilModule *pModule, CComPtr pSource); -void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor, - hlsl::options::ValidatorSelection SelectValidator = - hlsl::options::ValidatorSelection::Auto); +void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor); void AssembleToContainer(AssembleInputs &inputs); HRESULT Disassemble(IDxcBlob *pProgram, llvm::raw_string_ostream &Stream); void ReadOptsAndValidate(hlsl::options::MainArgs &mainArgs, diff --git a/tools/clang/tools/dxcompiler/dxillib.cpp b/tools/clang/tools/dxcompiler/dxillib.cpp deleted file mode 100644 index 72abc869da..0000000000 --- a/tools/clang/tools/dxcompiler/dxillib.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// // -// dxillib.cpp // -// Copyright (C) Microsoft Corporation. All rights reserved. 
// -// This file is distributed under the University of Illinois Open Source // -// License. See LICENSE.TXT for details. // -// // -// Provides access to dxil.dll // -// // -/////////////////////////////////////////////////////////////////////////////// - -#include "dxillib.h" -#include "dxc/Support/Global.h" // For DXASSERT -#include "dxc/Support/dxcapi.use.h" -#include "llvm/Support/Mutex.h" - -using namespace dxc; - -static DxcDllSupport g_DllSupport; -static HRESULT g_DllLibResult = S_OK; - -static llvm::sys::Mutex *cs = nullptr; - -// Check if we can successfully get IDxcValidator from dxil.dll -// This function is to prevent multiple attempts to load dxil.dll -HRESULT DxilLibInitialize() { - cs = new llvm::sys::Mutex; - cs->lock(); - g_DllLibResult = g_DllSupport.InitializeForDll(kDxilLib, "DxcCreateInstance"); - cs->unlock(); - return S_OK; -} - -HRESULT DxilLibCleanup(DxilLibCleanUpType type) { - HRESULT hr = S_OK; - if (type == DxilLibCleanUpType::ProcessTermination) { - g_DllSupport.Detach(); - } else if (type == DxilLibCleanUpType::UnloadLibrary) { - g_DllSupport.Cleanup(); - } else { - hr = E_INVALIDARG; - } - delete cs; - cs = nullptr; - return hr; -} - -// g_DllLibResult is S_OK by default, check again to see if dxil.dll is loaded -// If we fail to load dxil.dll, set g_DllLibResult to E_FAIL so that we don't -// have multiple attempts to load dxil.dll -bool DxilLibIsEnabled() { - cs->lock(); - if (SUCCEEDED(g_DllLibResult)) { - if (!g_DllSupport.IsEnabled()) { - g_DllLibResult = - g_DllSupport.InitializeForDll(kDxilLib, "DxcCreateInstance"); - } - } - cs->unlock(); - return SUCCEEDED(g_DllLibResult); -} - -HRESULT DxilLibCreateInstance(REFCLSID rclsid, REFIID riid, - IUnknown **ppInterface) { - DXASSERT_NOMSG(ppInterface != nullptr); - HRESULT hr = E_FAIL; - if (DxilLibIsEnabled()) { - cs->lock(); - hr = g_DllSupport.CreateInstance(rclsid, riid, ppInterface); - cs->unlock(); - } - return hr; -} diff --git a/tools/clang/tools/dxcompiler/dxillib.h b/tools/clang/tools/dxcompiler/dxillib.h deleted file mode 100644 index 879d023459..0000000000 --- a/tools/clang/tools/dxcompiler/dxillib.h +++ /dev/null @@ -1,42 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// // -// dxillib.h // -// Copyright (C) Microsoft Corporation. All rights reserved. // -// This file is distributed under the University of Illinois Open Source // -// License. See LICENSE.TXT for details. // -// // -// Provides wrappers to handle calls to dxil.dll // -// // -/////////////////////////////////////////////////////////////////////////////// - -#pragma once -#ifndef __DXC_DXILLIB__ -#define __DXC_DXILLIB__ - -#include "dxc/Support/WinIncludes.h" -#include "dxc/WinAdapter.h" - -// Initialize Dxil library. -HRESULT DxilLibInitialize(); - -// When dxcompiler is detached from process, -// we should not call FreeLibrary on process termination. 
-// So the caller has to specify if cleaning is from FreeLibrary or process -// termination -enum class DxilLibCleanUpType { UnloadLibrary, ProcessTermination }; - -HRESULT DxilLibCleanup(DxilLibCleanUpType type); - -// Check if can access dxil.dll -bool DxilLibIsEnabled(); - -HRESULT DxilLibCreateInstance(REFCLSID rclsid, REFIID riid, - IUnknown **ppInterface); - -template -HRESULT DxilLibCreateInstance(REFCLSID rclsid, TInterface **ppInterface) { - return DxilLibCreateInstance(rclsid, __uuidof(TInterface), - (IUnknown **)ppInterface); -} - -#endif // __DXC_DXILLIB__ diff --git a/tools/clang/unittests/HLSL/DxilContainerTest.cpp b/tools/clang/unittests/HLSL/DxilContainerTest.cpp index 339b33c655..34b4d338fe 100644 --- a/tools/clang/unittests/HLSL/DxilContainerTest.cpp +++ b/tools/clang/unittests/HLSL/DxilContainerTest.cpp @@ -103,6 +103,7 @@ class DxilContainerTest : public ::testing::Test { TEST_METHOD(CompileCSWaveSizeRange_CheckPSV0) TEST_METHOD(CompileWhenOkThenCheckRDAT) TEST_METHOD(CompileWhenOkThenCheckRDAT2) + TEST_METHOD(CompileWhenOkThenCheckRDATSM69) TEST_METHOD(CompileWhenOkThenCheckReflection1) TEST_METHOD(DxcUtils_CreateReflection) TEST_METHOD(CheckReflectionQueryInterface) @@ -1444,6 +1445,146 @@ TEST_F(DxilContainerTest, CompileCSWaveSizeRange_CheckPSV0) { TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDAT) { if (m_ver.SkipDxilVersion(1, 3)) return; + const char *shader = + "float c_buf;" + "RWTexture1D tex : register(u5);" + "Texture1D tex2 : register(t0);" + "RWByteAddressBuffer b_buf;" + "struct Foo { float2 f2; int2 i2; };" + "AppendStructuredBuffer append_buf;" + "ConsumeStructuredBuffer consume_buf;" + "RasterizerOrderedByteAddressBuffer rov_buf;" + "globallycoherent RWByteAddressBuffer gc_buf;" + "float function_import(float x);" + "export float function0(min16float x) { " + " return x + 1 + tex[0].x; }" + "export float function1(float x, min12int i) {" + " return x + c_buf + b_buf.Load(x) + tex2[i].x; }" + "export float function2(float x) { return x + function_import(x); }" + "export void function3(int i) {" + " Foo f = consume_buf.Consume();" + " f.f2 += 0.5; append_buf.Append(f);" + " rov_buf.Store(i, f.i2.x);" + " gc_buf.Store(i, f.i2.y);" + " b_buf.Store(i, f.i2.x + f.i2.y); }"; + CComPtr pCompiler; + CComPtr pSource; + CComPtr pProgram; + CComPtr pDisassembly; + CComPtr pResult; + + struct CheckResFlagInfo { + std::string name; + hlsl::DXIL::ResourceKind kind; + hlsl::RDAT::DxilResourceFlag flag; + }; + const unsigned numResFlagCheck = 5; + CheckResFlagInfo resFlags[numResFlagCheck] = { + {"b_buf", hlsl::DXIL::ResourceKind::RawBuffer, + hlsl::RDAT::DxilResourceFlag::None}, + {"append_buf", hlsl::DXIL::ResourceKind::StructuredBuffer, + hlsl::RDAT::DxilResourceFlag::UAVCounter}, + {"consume_buf", hlsl::DXIL::ResourceKind::StructuredBuffer, + hlsl::RDAT::DxilResourceFlag::UAVCounter}, + {"gc_buf", hlsl::DXIL::ResourceKind::RawBuffer, + hlsl::RDAT::DxilResourceFlag::UAVGloballyCoherent}, + {"rov_buf", hlsl::DXIL::ResourceKind::RawBuffer, + hlsl::RDAT::DxilResourceFlag::UAVRasterizerOrderedView}}; + + VERIFY_SUCCEEDED(CreateCompiler(&pCompiler)); + CreateBlobFromText(shader, &pSource); + VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"hlsl.hlsl", L"main", + L"lib_6_3", nullptr, 0, nullptr, 0, + nullptr, &pResult)); + HRESULT hrStatus; + VERIFY_SUCCEEDED(pResult->GetStatus(&hrStatus)); + VERIFY_SUCCEEDED(hrStatus); + VERIFY_SUCCEEDED(pResult->GetResult(&pProgram)); + CComPtr containerReflection; + uint32_t partCount; + 
IFT(m_dllSupport.CreateInstance(CLSID_DxcContainerReflection, + &containerReflection)); + IFT(containerReflection->Load(pProgram)); + IFT(containerReflection->GetPartCount(&partCount)); + bool blobFound = false; + for (uint32_t i = 0; i < partCount; ++i) { + uint32_t kind; + IFT(containerReflection->GetPartKind(i, &kind)); + if (kind == (uint32_t)hlsl::DxilFourCC::DFCC_RuntimeData) { + blobFound = true; + using namespace hlsl::RDAT; + CComPtr pBlob; + IFT(containerReflection->GetPartContent(i, &pBlob)); + // Validate using DxilRuntimeData + DxilRuntimeData context; + context.InitFromRDAT((char *)pBlob->GetBufferPointer(), + pBlob->GetBufferSize()); + auto funcTable = context.GetFunctionTable(); + auto resTable = context.GetResourceTable(); + VERIFY_ARE_EQUAL(funcTable.Count(), 4U); + std::string str("function"); + for (uint32_t j = 0; j < funcTable.Count(); ++j) { + auto funcReader = funcTable[j]; + std::string funcName(funcReader.getUnmangledName()); + VERIFY_IS_TRUE(str.compare(funcName.substr(0, 8)) == 0); + std::string cur_str = str; + cur_str.push_back('0' + j); + if (cur_str.compare("function0") == 0) { + VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 1U); + hlsl::ShaderFlags flag; + flag.SetUAVLoadAdditionalFormats(true); + flag.SetLowPrecisionPresent(true); + uint64_t rawFlag = flag.GetFeatureInfo(); + VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags(), rawFlag); + auto resReader = funcReader.getResources()[0]; + VERIFY_ARE_EQUAL(resReader.getClass(), + hlsl::DXIL::ResourceClass::UAV); + VERIFY_ARE_EQUAL(resReader.getKind(), + hlsl::DXIL::ResourceKind::Texture1D); + } else if (cur_str.compare("function1") == 0) { + hlsl::ShaderFlags flag; + flag.SetLowPrecisionPresent(true); + uint64_t rawFlag = flag.GetFeatureInfo(); + VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags(), rawFlag); + VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 3U); + } else if (cur_str.compare("function2") == 0) { + VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags() & 0xffffffffffffffff, + 0U); + VERIFY_ARE_EQUAL(funcReader.getResources().Count(), 0U); + std::string dependency = funcReader.getFunctionDependencies()[0]; + VERIFY_IS_TRUE(dependency.find("function_import") != + std::string::npos); + } else if (cur_str.compare("function3") == 0) { + VERIFY_ARE_EQUAL(funcReader.GetFeatureFlags() & 0xffffffffffffffff, + 0U); + VERIFY_ARE_EQUAL(funcReader.getResources().Count(), numResFlagCheck); + for (unsigned i = 0; i < funcReader.getResources().Count(); ++i) { + auto resReader = funcReader.getResources()[0]; + VERIFY_ARE_EQUAL(resReader.getClass(), + hlsl::DXIL::ResourceClass::UAV); + unsigned j = 0; + for (; j < numResFlagCheck; ++j) { + if (resFlags[j].name.compare(resReader.getName()) == 0) + break; + } + VERIFY_IS_LESS_THAN(j, numResFlagCheck); + VERIFY_ARE_EQUAL(resReader.getKind(), resFlags[j].kind); + VERIFY_ARE_EQUAL(resReader.getFlags(), + static_cast(resFlags[j].flag)); + } + } else { + IFTBOOLMSG(false, E_FAIL, "unknown function name"); + } + } + VERIFY_ARE_EQUAL(resTable.Count(), 8U); + } + } + IFTBOOLMSG(blobFound, E_FAIL, "failed to find RDAT blob after compiling"); +} + +TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDATSM69) { + if (m_ver.SkipDxilVersion(1, 9)) + return; const char *shader = "float c_buf;" "RWTexture1D tex : register(u5);" @@ -1497,7 +1638,7 @@ TEST_F(DxilContainerTest, CompileWhenOkThenCheckRDAT) { VERIFY_SUCCEEDED(CreateCompiler(&pCompiler)); CreateBlobFromText(shader, &pSource); VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"hlsl.hlsl", L"main", - L"lib_6_3", nullptr, 0, nullptr, 0, 
+ L"lib_6_9", nullptr, 0, nullptr, 0, nullptr, &pResult)); HRESULT hrStatus; VERIFY_SUCCEEDED(pResult->GetStatus(&hrStatus)); diff --git a/tools/clang/unittests/HLSL/PixDiaTest.cpp b/tools/clang/unittests/HLSL/PixDiaTest.cpp index a4439b998d..d36e762762 100644 --- a/tools/clang/unittests/HLSL/PixDiaTest.cpp +++ b/tools/clang/unittests/HLSL/PixDiaTest.cpp @@ -13,6 +13,7 @@ #ifdef _WIN32 #include +#include #include "dxc/DxilContainer/DxilContainer.h" #include "dxc/Support/WinIncludes.h" @@ -186,6 +187,7 @@ class PixDiaTest { TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Derived) TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Bool) TEST_METHOD(DxcPixDxilDebugInfo_BitFields_Overlap) + TEST_METHOD(DxcPixDxilDebugInfo_BitFields_uint64) TEST_METHOD(DxcPixDxilDebugInfo_Min16SizesAndOffsets_Enabled) TEST_METHOD(DxcPixDxilDebugInfo_Min16SizesAndOffsets_Disabled) TEST_METHOD(DxcPixDxilDebugInfo_Min16VectorOffsets_Enabled) @@ -658,11 +660,11 @@ class PixDiaTest { const char *hlsl, const wchar_t *profile, const char *lineAtWhichToExamineVariables, std::vector const &ExpectedVariables); - void RunSizeAndOffsetTestCase(const char *hlsl, - std::array const &memberOffsets, - std::array const &memberSizes, - std::vector extraArgs = { - L"-Od"}); + CComPtr + RunSizeAndOffsetTestCase(const char *hlsl, + std::array const &memberOffsets, + std::array const &memberSizes, + std::vector extraArgs = {L"-Od"}); void RunVectorSizeAndOffsetTestCase(const char *hlsl, std::array const &memberOffsets, std::vector extraArgs = { @@ -2948,12 +2950,11 @@ void main() VERIFY_ARE_EQUAL(32u, secondFieldOffset); } -void PixDiaTest::RunSizeAndOffsetTestCase( - const char *hlsl, std::array const &memberOffsets, - std::array const &memberSizes, - std::vector extraArgs) { - if (m_ver.SkipDxilVersion(1, 5)) - return; +CComPtr +PixDiaTest::RunSizeAndOffsetTestCase(const char *hlsl, + std::array const &memberOffsets, + std::array const &memberSizes, + std::vector extraArgs) { auto debugInfo = CompileAndCreateDxcDebug(hlsl, L"cs_6_5", nullptr, extraArgs).debugInfo; auto live = GetLiveVariablesAt(hlsl, "STOP_HERE", debugInfo); @@ -2974,9 +2975,46 @@ void PixDiaTest::RunSizeAndOffsetTestCase( VERIFY_SUCCEEDED(field->GetFieldSizeInBits(&sizeInBits)); VERIFY_ARE_EQUAL(memberSizes[i], sizeInBits); } + // Check that first and second and third are reported as residing in the same + // register (cuz they do!), and that the third does not + + CComPtr bfStorage; + VERIFY_SUCCEEDED(bf->GetStorage(&bfStorage)); + return bfStorage; +} + +void RunBitfieldAdjacencyTest( + IDxcPixDxilStorage *bfStorage, + std::vector> const &adjacentRuns) { + std::vector> registersByRun; + registersByRun.resize(adjacentRuns.size()); + for (size_t run = 0; run < adjacentRuns.size(); ++run) { + for (auto const &field : adjacentRuns[run]) { + CComPtr fieldStorage; + VERIFY_SUCCEEDED(bfStorage->AccessField(field, &fieldStorage)); + DWORD reg; + VERIFY_SUCCEEDED(fieldStorage->GetRegisterNumber(®)); + registersByRun[run].insert(reg); + } + } + for (size_t run = 0; run < registersByRun.size(); ++run) { + { + // Every field in this run should have the same register number, so this + // set should be of size 1: + VERIFY_ARE_EQUAL(1, registersByRun[run].size()); + // Every adjacent run should have different register numbers: + if (run != 0) { + VERIFY_ARE_NOT_EQUAL(*registersByRun[run - 1].begin(), + *registersByRun[run].begin()); + } + } + } } TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Simple) { + if (m_ver.SkipDxilVersion(1, 5)) + return; + const char *hlsl = R"( struct Bitfields 
{ @@ -3000,10 +3038,16 @@ void main() } )"; - RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32}); + auto bfStorage = + RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32}); + RunBitfieldAdjacencyTest(bfStorage, + {{L"first", L"second"}, {L"third"}, {L"fourth"}}); } TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Derived) { + if (m_ver.SkipDxilVersion(1, 5)) + return; + const char *hlsl = R"( struct Bitfields { @@ -3027,10 +3071,16 @@ void main() } )"; - RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32}); + auto bfStorage = + RunSizeAndOffsetTestCase(hlsl, {0, 17, 32, 64}, {17, 15, 3, 32}); + RunBitfieldAdjacencyTest(bfStorage, + {{L"first", L"second"}, {L"third"}, {L"fourth"}}); } TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Bool) { + if (m_ver.SkipDxilVersion(1, 5)) + return; + const char *hlsl = R"( struct Bitfields { @@ -3054,17 +3104,58 @@ void main() } )"; - RunSizeAndOffsetTestCase(hlsl, {0, 1, 2, 32}, {1, 1, 3, 32}); + auto bfStorage = RunSizeAndOffsetTestCase(hlsl, {0, 1, 2, 32}, {1, 1, 3, 32}); + RunBitfieldAdjacencyTest(bfStorage, + {{L"first", L"second", L"third"}, {L"fourth"}}); } TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_Overlap) { + if (m_ver.SkipDxilVersion(1, 5)) + return; + + const char *hlsl = R"( +struct Bitfields +{ + uint32_t first : 20; + uint32_t second : 20; // should end up in second DWORD + uint32_t third : 3; // should shader second DWORD + uint32_t fourth; // should be in third DWORD +}; + +RWStructuredBuffer UAV: register(u0); + +[numthreads(1, 1, 1)] +void main() +{ + Bitfields bf; + bf.first = UAV[0]; + bf.second = UAV[1]; + bf.third = UAV[2]; + bf.fourth = UAV[3]; + UAV[16] = bf.first + bf.second + bf.third + bf.fourth; //STOP_HERE +} + +)"; + auto bfStorage = + RunSizeAndOffsetTestCase(hlsl, {0, 32, 52, 64}, {20, 20, 3, 32}); + // (PIX #58022343): fields that overlap their storage type are not yet + // reflected properly in terms of their packed offsets as maintained via + // these PixDxc interfaces based on the dbg.declare data + // RunBitfieldAdjacencyTest(bfStorage, + // {{L"first"}, {L"second", L"third"}, {L"fourth"}}); +} + +TEST_F(PixDiaTest, DxcPixDxilDebugInfo_BitFields_uint64) { + if (m_ver.SkipDxilVersion(1, 5)) + return; + const char *hlsl = R"( struct Bitfields { - unsigned int first : 20; - unsigned int second : 20; // should end up in second DWORD - unsigned int third : 3; // should shader second DWORD - unsigned int fourth; // should be in third DWORD + uint64_t first : 20; + uint64_t second : 20; // should end up in first uint64 also + uint64_t third : 24; // in first + uint64_t fourth; // should be in second }; RWStructuredBuffer UAV: register(u0); @@ -3081,7 +3172,10 @@ void main() } )"; - RunSizeAndOffsetTestCase(hlsl, {0, 32, 52, 64}, {20, 20, 3, 32}); + auto bfStorage = + RunSizeAndOffsetTestCase(hlsl, {0, 20, 40, 64}, {20, 20, 24, 64}); + RunBitfieldAdjacencyTest(bfStorage, + {{L"first", L"second", L"third"}, {L"fourth"}}); } TEST_F(PixDiaTest, DxcPixDxilDebugInfo_Alignment_ConstInt) { @@ -3502,9 +3596,10 @@ void ClosestHitShader3(inout RayPayload payload, in BuiltInTriangleIntersectionA // Case: same function called from two places in same top-level function. // In this case, we expect the storage for the variable to be in the same - // place for both "instances" of the function: as a thread proceeds through - // the caller, it will write new values into the variable's storage during - // the second or subsequent invocations of the inlined function. 
+ // place for both "instances" of the function: as a thread proceeds + // through the caller, it will write new values into the variable's + // storage during the second or subsequent invocations of the inlined + // function. DWORD instructionOffset = AdvanceUntilFunctionEntered(dxilDebugger, 0, L"ClosestHitShader3"); instructionOffset = AdvanceUntilFunctionEntered( @@ -3550,9 +3645,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_ForScopes) { // Case: same function called from two places in same top-level function. // In this case, we expect the storage for the variable to be in the same - // place for both "instances" of the function: as a thread proceeds through - // the caller, it will write new values into the variable's storage during - // the second or subsequent invocations of the inlined function. + // place for both "instances" of the function: as a thread proceeds + // through the caller, it will write new values into the variable's + // storage during the second or subsequent invocations of the inlined + // function. DWORD instructionOffset = AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain"); @@ -3597,9 +3693,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_ScopeBraces) { // Case: same function called from two places in same top-level function. // In this case, we expect the storage for the variable to be in the same - // place for both "instances" of the function: as a thread proceeds through - // the caller, it will write new values into the variable's storage during - // the second or subsequent invocations of the inlined function. + // place for both "instances" of the function: as a thread proceeds + // through the caller, it will write new values into the variable's + // storage during the second or subsequent invocations of the inlined + // function. DWORD instructionOffset = AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain"); @@ -3644,9 +3741,10 @@ TEST_F(PixDiaTest, DxcPixDxilDebugInfo_VariableScopes_Function) { // Case: same function called from two places in same top-level function. // In this case, we expect the storage for the variable to be in the same - // place for both "instances" of the function: as a thread proceeds through - // the caller, it will write new values into the variable's storage during - // the second or subsequent invocations of the inlined function. + // place for both "instances" of the function: as a thread proceeds + // through the caller, it will write new values into the variable's + // storage during the second or subsequent invocations of the inlined + // function. DWORD instructionOffset = AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain"); @@ -3692,9 +3790,10 @@ void CSMain() // Case: same function called from two places in same top-level function. // In this case, we expect the storage for the variable to be in the same - // place for both "instances" of the function: as a thread proceeds through - // the caller, it will write new values into the variable's storage during - // the second or subsequent invocations of the inlined function. + // place for both "instances" of the function: as a thread proceeds + // through the caller, it will write new values into the variable's + // storage during the second or subsequent invocations of the inlined + // function. 
DWORD instructionOffset = AdvanceUntilFunctionEntered(dxilDebugger, 0, L"CSMain"); diff --git a/tools/clang/unittests/HLSL/PixTest.cpp b/tools/clang/unittests/HLSL/PixTest.cpp index e337d2951c..c032e9e872 100644 --- a/tools/clang/unittests/HLSL/PixTest.cpp +++ b/tools/clang/unittests/HLSL/PixTest.cpp @@ -119,7 +119,6 @@ class PixTest : public ::testing::Test { TEST_METHOD(AccessTracking_ModificationReport_SM66) TEST_METHOD(PixStructAnnotation_Lib_DualRaygen) - TEST_METHOD(PixStructAnnotation_Lib_RaygenAllocaStructAlignment) TEST_METHOD(PixStructAnnotation_Simple) TEST_METHOD(PixStructAnnotation_CopiedStruct) @@ -1221,7 +1220,6 @@ PixTest::TestableResults PixTest::TestStructAnnotationCase( #if 0 // handy for debugging auto disTextW = Disassemble(pAnnotatedContainer); - WEX::Logging::Log::Comment(disTextW.c_str()); #endif ModuleAndHangersOn moduleEtc(pAnnotatedContainer); @@ -1455,100 +1453,6 @@ void Raygen1() } } -TEST_F(PixTest, PixStructAnnotation_Lib_RaygenAllocaStructAlignment) { - if (m_ver.SkipDxilVersion(1, 5)) - return; - - const char *hlsl = R"( - -RaytracingAccelerationStructure Scene : register(t0, space0); -RWTexture2D RenderTarget : register(u0); - -struct SceneConstantBuffer -{ - float4x4 projectionToWorld; - float4 cameraPosition; - float4 lightPosition; - float4 lightAmbientColor; - float4 lightDiffuseColor; -}; - -ConstantBuffer g_sceneCB : register(b0); - -struct RayPayload -{ - float4 color; -}; - -inline void GenerateCameraRay(uint2 index, out float3 origin, out float3 direction) -{ - float2 xy = index + 0.5f; // center in the middle of the pixel. - float2 screenPos = xy;// / DispatchRaysDimensions().xy * 2.0 - 1.0; - - // Invert Y for DirectX-style coordinates. - screenPos.y = -screenPos.y; - - // Unproject the pixel coordinate into a ray. - float4 world = /*mul(*/float4(screenPos, 0, 1)/*, g_sceneCB.projectionToWorld)*/; - - //world.xyz /= world.w; - origin = world.xyz; //g_sceneCB.cameraPosition.xyz; - direction = float3(1,0,0);//normalize(world.xyz - origin); -} - -void RaygenCommon() -{ - float3 rayDir; - float3 origin; - - // Generate a ray for a camera pixel corresponding to an index from the dispatched 2D grid. - GenerateCameraRay(DispatchRaysIndex().xy, origin, rayDir); - - // Trace the ray. - // Set the ray's extents. - RayDesc ray; - ray.Origin = origin; - ray.Direction = rayDir; - // Set TMin to a non-zero small value to avoid aliasing issues due to floating - point errors. - // TMin should be kept small to prevent missing geometry at close contact areas. - ray.TMin = 0.001; - ray.TMax = 10000.0; - RayPayload payload = { float4(0, 0, 0, 0) }; - TraceRay(Scene, RAY_FLAG_CULL_BACK_FACING_TRIANGLES, ~0, 0, 1, 0, ray, payload); - - // Write the raytraced color to the output texture. - // RenderTarget[DispatchRaysIndex().xy] = payload.color; -} - -[shader("raygeneration")] -void Raygen() -{ - RaygenCommon(); -} -)"; - - auto Testables = TestStructAnnotationCase(hlsl, L"-Od", true, L"lib_6_6"); - - // Built-in type "RayDesc" has this structure: struct { float3 Origin; float - // TMin; float3 Direction; float TMax; } This is 8 floats, with members at - // offsets 0,3,4,7 respectively. 
- - auto FindAtLeastOneOf = [=](char const *name, uint32_t index) { - VERIFY_IS_TRUE(std::find_if(Testables.AllocaWrites.begin(), - Testables.AllocaWrites.end(), - [&name, &index](AllocaWrite const &aw) { - return 0 == strcmp(aw.memberName.c_str(), - name) && - aw.index == index; - }) != Testables.AllocaWrites.end()); - }; - - FindAtLeastOneOf("Origin.x", 0); - FindAtLeastOneOf("TMin", 3); - FindAtLeastOneOf("Direction.x", 4); - FindAtLeastOneOf("TMax", 7); -} - TEST_F(PixTest, PixStructAnnotation_Simple) { if (m_ver.SkipDxilVersion(1, 5)) return; @@ -3441,7 +3345,6 @@ void RaygenInternalName() // check that there are alloca writes that cover all of them. RayPayload // has four elements, and RayDesc has eight. std::array RayPayloadElementCoverage; - std::array RayDescElementCoverage; for (auto const &write : metaDataKeyToValue.allocaWrites) { // the whole point of the changes with this test is to separate vector @@ -3452,14 +3355,10 @@ void RaygenInternalName() if (findAlloca != metaDataKeyToValue.allocaDefinitions.end()) { if (findAlloca->second.count == 4) { RayPayloadElementCoverage[write.second.offset] = true; - } else if (findAlloca->second.count == 8) { - RayDescElementCoverage[write.second.offset] = true; } } } // Check that coverage for every element was emitted: for (auto const &b : RayPayloadElementCoverage) VERIFY_IS_TRUE(b); - for (auto const &b : RayDescElementCoverage) - VERIFY_IS_TRUE(b); } diff --git a/tools/clang/unittests/HLSL/ValidationTest.cpp b/tools/clang/unittests/HLSL/ValidationTest.cpp index 01f24e0227..980bf6c7c2 100644 --- a/tools/clang/unittests/HLSL/ValidationTest.cpp +++ b/tools/clang/unittests/HLSL/ValidationTest.cpp @@ -1488,7 +1488,7 @@ TEST_F(ValidationTest, StructBufGlobalCoherentAndCounter) { L"..\\DXILValidation\\struct_buf1.hlsl", "ps_6_0", "!\"buf2\", i32 0, i32 0, i32 1, i32 12, i1 false, i1 false", "!\"buf2\", i32 0, i32 0, i32 1, i32 12, i1 true, i1 true", - "globallycoherent cannot be used with append/consume buffers: 'buf2'"); + "globallycoherent cannot be used on buffer with counter 'buf2'"); } TEST_F(ValidationTest, StructBufStrideAlign) { diff --git a/tools/clang/unittests/HLSLExec/CMakeLists.txt b/tools/clang/unittests/HLSLExec/CMakeLists.txt index 3878fa3f34..b490ac94e9 100644 --- a/tools/clang/unittests/HLSLExec/CMakeLists.txt +++ b/tools/clang/unittests/HLSLExec/CMakeLists.txt @@ -3,9 +3,13 @@ find_package(TAEF REQUIRED) find_package(D3D12 REQUIRED) # Used for ExecutionTest.cpp. 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") + add_clang_library(ExecHLSLTests SHARED ExecutionTest.cpp ShaderOpTest.cpp + TableParameterHandler.cpp + LongVectors.cpp ExecHLSLTests.rc ) diff --git a/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc b/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc index 6f4659910c..29459ee825 100644 --- a/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc +++ b/tools/clang/unittests/HLSLExec/ExecHLSLTests.rc @@ -1,3 +1,4 @@ #include -ShaderOpArithTable.xml DATASOURCE_XML "ShaderOpArithTable.xml" \ No newline at end of file +ShaderOpArithTable.xml DATASOURCE_XML "ShaderOpArithTable.xml" +LongVectorOpTable.xml DATASOURCE_XML "LongVectorOpTable.xml" diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 6db27d7a41..586c55328d 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -11,7 +11,7 @@ /////////////////////////////////////////////////////////////////////////////// // We need to keep & fix these warnings to integrate smoothly with HLK -#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389 4018) +#pragma warning(error : 4100 4242 4244 4267 4701 4389 4018) // *** THIS FILE CANNOT TAKE ANY LLVM DEPENDENCIES *** // @@ -60,6 +60,8 @@ #include "ShaderOpTest.h" #include #include +#include "TableParameterHandler.h" +#include "HlslExecTestUtils.h" // clang-format on #pragma comment(lib, "d3dcompiler.lib") @@ -67,47 +69,6 @@ #pragma comment(lib, "dxguid.lib") #pragma comment(lib, "version.lib") -// A more recent Windows SDK than currently required is needed for these. -typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)( - UINT NumFeatures, __in_ecount(NumFeatures) const IID *pIIDs, - __in_ecount_opt(NumFeatures) void *pConfigurationStructs, - __in_ecount_opt(NumFeatures) UINT *pConfigurationStructSizes); - -static const GUID D3D12ExperimentalShaderModelsID = - {/* 76f5573e-f13a-40f5-b297-81ce9e18933f */ - 0x76f5573e, - 0xf13a, - 0x40f5, - {0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f}}; - -// Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically. -typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID rclsid, REFIID riid, - void **ppvDebug); - -#ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__ -// Copied from AgilitySDK D3D12.h to programmatically enable when in developer -// mode. -#define __ID3D12SDKConfiguration_INTERFACE_DEFINED__ - -EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = { - 0xe9eb5314, - 0x33aa, - 0x42b2, - {0xa7, 0x18, 0xd7, 0x7f, 0x58, 0xb1, 0xf1, 0xc7}}; -EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = { - 0x7cda6aca, - 0xa03e, - 0x49c8, - {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}}; - -MIDL_INTERFACE("e9eb5314-33aa-42b2-a718-d77f58b1f1c7") -ID3D12SDKConfiguration : public IUnknown { -public: - virtual HRESULT STDMETHODCALLTYPE SetSDKVersion(UINT SDKVersion, - LPCSTR SDKPath) = 0; -}; -#endif /* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */ - using namespace DirectX; using namespace hlsl_test; @@ -271,9 +232,6 @@ typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS4 { #endif -// Virtual class to compute the expected result given a set of inputs -struct TableParameter; - class ExecutionTest { public: BEGIN_TEST_CLASS(ExecutionTest) @@ -519,10 +477,10 @@ class ExecutionTest { return false; // Do not: FreeLibrary(hRuntime); // If we actually free the library, it defeats the purpose of - // EnableAgilitySDK and EnableExperimentalMode. 
+ // enableAgilitySDK and enableExperimentalMode. HRESULT hr; - hr = EnableAgilitySDK(hRuntime); + hr = enableAgilitySDK(hRuntime); if (FAILED(hr)) { LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", hr); } else if (hr == S_FALSE) { @@ -531,7 +489,7 @@ class ExecutionTest { LogCommentFmt(L"Agility SDK enabled."); } - hr = EnableExperimentalMode(hRuntime); + hr = enableExperimentalMode(hRuntime); if (FAILED(hr)) { LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.", hr); @@ -541,7 +499,7 @@ class ExecutionTest { LogCommentFmt(L"Experimental mode enabled."); } - hr = EnableDebugLayer(); + hr = enableDebugLayer(); if (FAILED(hr)) { LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", hr); } else if (hr == S_FALSE) { @@ -602,41 +560,31 @@ class ExecutionTest { // Do not remove the following line - it is used by TranslateExecutionTest.py // MARKER: ExecutionTest/DxilConf Shared Implementation Start - // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we - // only require the Windows 10 SDK. - typedef enum D3D_SHADER_MODEL { - D3D_SHADER_MODEL_5_1 = 0x51, - D3D_SHADER_MODEL_6_0 = 0x60, - D3D_SHADER_MODEL_6_1 = 0x61, - D3D_SHADER_MODEL_6_2 = 0x62, - D3D_SHADER_MODEL_6_3 = 0x63, - D3D_SHADER_MODEL_6_4 = 0x64, - D3D_SHADER_MODEL_6_5 = 0x65, - D3D_SHADER_MODEL_6_6 = 0x66, - D3D_SHADER_MODEL_6_7 = 0x67, - D3D_SHADER_MODEL_6_8 = 0x68, - D3D_SHADER_MODEL_6_9 = 0x69, - } D3D_SHADER_MODEL; - - static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_9; - - bool UseDxbc() { -#ifdef _HLK_CONF - return false; -#else - return GetTestParamBool(L"DXBC"); -#endif - } - - bool UseWarpByDefault() { -#ifdef _HLK_CONF - return false; -#else - return true; -#endif - } - - bool UseDebugIfaces() { return true; } + // We define D3D_SHADER_MODEL enum values as we don't generally have access to + // the latest D3D headers when adding tests for a new SM being added. + using D3D_SHADER_MODEL = ExecTestUtils::D3D_SHADER_MODEL; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_0 = + ExecTestUtils::D3D_SHADER_MODEL_6_0; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_1 = + ExecTestUtils::D3D_SHADER_MODEL_6_1; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_2 = + ExecTestUtils::D3D_SHADER_MODEL_6_2; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_3 = + ExecTestUtils::D3D_SHADER_MODEL_6_3; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_4 = + ExecTestUtils::D3D_SHADER_MODEL_6_4; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_5 = + ExecTestUtils::D3D_SHADER_MODEL_6_5; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_6 = + ExecTestUtils::D3D_SHADER_MODEL_6_6; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_7 = + ExecTestUtils::D3D_SHADER_MODEL_6_7; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_8 = + ExecTestUtils::D3D_SHADER_MODEL_6_8; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_SHADER_MODEL_6_9 = + ExecTestUtils::D3D_SHADER_MODEL_6_9; + static constexpr ExecTestUtils::D3D_SHADER_MODEL D3D_HIGHEST_SHADER_MODEL = + ExecTestUtils::D3D_HIGHEST_SHADER_MODEL; bool SaveImages() { return GetTestParamBool(L"SaveImages"); } @@ -766,7 +714,7 @@ class ExecutionTest { CComPtr pComputeShader; // Load and compile shaders. 
- if (UseDxbc()) { + if (useDxbc()) { #ifndef _HLK_CONF DXBCFromText(pShader, L"main", pTargetProfile, &pComputeShader); #endif @@ -784,112 +732,6 @@ class ExecutionTest { &computePsoDesc, IID_PPV_ARGS(ppComputeState))); } - bool CreateDevice(ID3D12Device **ppDevice, - D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0, - bool skipUnsupported = true) { - if (testModel > HIGHEST_SHADER_MODEL) { - UINT minor = (UINT)testModel & 0x0f; - LogCommentFmt(L"Installed SDK does not support " - L"shader model 6.%1u", - minor); - - if (skipUnsupported) { - WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); - } - - return false; - } - CComPtr factory; - CComPtr pDevice; - - *ppDevice = nullptr; - - VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory))); - if (GetTestParamUseWARP(UseWarpByDefault())) { - CComPtr warpAdapter; - VERIFY_SUCCEEDED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter))); - HRESULT createHR = D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0, - IID_PPV_ARGS(&pDevice)); - if (FAILED(createHR)) { - LogCommentFmt(L"The available version of WARP does not support d3d12."); - - if (skipUnsupported) { - WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); - } - - return false; - } - - if (GetModuleHandleW(L"d3d10warp.dll") != NULL) { - WCHAR szFullModuleFilePath[MAX_PATH] = L""; - GetModuleFileNameW(GetModuleHandleW(L"d3d10warp.dll"), - szFullModuleFilePath, sizeof(szFullModuleFilePath)); - WEX::Logging::Log::Comment(WEX::Common::String().Format( - L"WARP driver loaded from: %S", szFullModuleFilePath)); - } - - } else { - CComPtr hardwareAdapter; - WEX::Common::String AdapterValue; - HRESULT hr = WEX::TestExecution::RuntimeParameters::TryGetValue( - L"Adapter", AdapterValue); - if (SUCCEEDED(hr)) { - st::GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter); - } else { - WEX::Logging::Log::Comment( - L"Using default hardware adapter with D3D12 support."); - } - - VERIFY_SUCCEEDED(D3D12CreateDevice( - hardwareAdapter, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&pDevice))); - } - // retrieve adapter information - LUID adapterID = pDevice->GetAdapterLuid(); - CComPtr adapter; - factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter)); - DXGI_ADAPTER_DESC AdapterDesc; - VERIFY_SUCCEEDED(adapter->GetDesc(&AdapterDesc)); - LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description); - - if (pDevice == nullptr) - return false; - - if (!UseDxbc()) { - // Check for DXIL support. 
- typedef struct D3D12_FEATURE_DATA_SHADER_MODEL { - D3D_SHADER_MODEL HighestShaderModel; - } D3D12_FEATURE_DATA_SHADER_MODEL; - const UINT D3D12_FEATURE_SHADER_MODEL = 7; - D3D12_FEATURE_DATA_SHADER_MODEL SMData; - SMData.HighestShaderModel = testModel; - if (FAILED(pDevice->CheckFeatureSupport( - (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData, - sizeof(SMData))) || - SMData.HighestShaderModel < testModel) { - UINT minor = (UINT)testModel & 0x0f; - LogCommentFmt(L"The selected device does not support " - L"shader model 6.%1u", - minor); - - if (skipUnsupported) { - WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); - } - - return false; - } - } - - if (UseDebugIfaces()) { - CComPtr pInfoQueue; - if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) { - pInfoQueue->SetMuteDebugOutput(FALSE); - } - } - - *ppDevice = pDevice.Detach(); - return true; - } - void CreateGraphicsCommandQueue(ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue) { D3D12_COMMAND_QUEUE_DESC queueDesc = {}; @@ -919,7 +761,7 @@ class ExecutionTest { CComPtr vertexShader; CComPtr pixelShader; - if (UseDxbc()) { + if (useDxbc()) { #ifndef _HLK_CONF DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader); DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader); @@ -1642,7 +1484,7 @@ class ExecutionTest { // The debug layer does net yet validate DXIL programs that require // rewriting, but basic logging should work properly. HRESULT hr = S_FALSE; - if (UseDebugIfaces()) { + if (useDebugIfaces()) { CComPtr debugController; hr = D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)); if (SUCCEEDED(hr)) { @@ -1830,20 +1672,6 @@ class ExecutionTest { } } - void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) { - VERIFY_SUCCEEDED(m_support.Initialize()); - CComPtr pLibrary; - CComPtr pBlob; - CComPtr pStream; - std::wstring path = GetPathToHlslDataFile(relativePath, HLSLDATAFILEPARAM, - DEFAULT_EXEC_TEST_DIR); - VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary)); - VERIFY_SUCCEEDED( - pLibrary->CreateBlobFromFile(path.c_str(), nullptr, &pBlob)); - VERIFY_SUCCEEDED(pLibrary->CreateStreamFromBlobReadOnly(pBlob, &pStream)); - *ppStream = pStream.Detach(); - } - void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList, ID3D12DescriptorHeap *pRtvHeap, UINT rtvDescriptorSize, UINT instanceCount, @@ -2348,15 +2176,15 @@ TEST_F(ExecutionTest, LifetimeIntrinsicTest) { static const int DispatchGroupCount = 1; CComPtr pDevice; - bool bSM_6_6_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6, false); + bool bSM_6_6_Supported = createDevice(&pDevice, D3D_SHADER_MODEL_6_6, false); bool bSM_6_3_Supported = bSM_6_6_Supported; if (!bSM_6_6_Supported) { // Try 6.3 for downlevel DXR case - bSM_6_3_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_3, false); + bSM_6_3_Supported = createDevice(&pDevice, D3D_SHADER_MODEL_6_3, false); } if (!bSM_6_3_Supported) { // Otherwise, 6.0 better be supported for compute case - VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0, false)); + VERIFY_IS_TRUE(createDevice(&pDevice, D3D_SHADER_MODEL_6_0, false)); } bool bDXRSupported = bSM_6_3_Supported && DoesDeviceSupportRayTracing(pDevice); @@ -2465,7 +2293,7 @@ TEST_F(ExecutionTest, BasicComputeTest) { static const int DispatchGroupCount = 1; CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; std::vector values; @@ -2524,7 +2352,7 @@ TEST_F(ExecutionTest, BasicTriangleTest) { " return 1; //input.color;\r\n" "};\r\n"; - if 
(!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; struct BasicTestChecker { @@ -2668,7 +2496,7 @@ TEST_F(ExecutionTest, Int64Test) { static const int DispatchGroupCount = 1; CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; if (!DoesDeviceSupportInt64(pDevice)) { @@ -2693,7 +2521,7 @@ TEST_F(ExecutionTest, SignTest) { "}"; CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; const uint32_t neg1 = (uint32_t)-1; @@ -2714,7 +2542,7 @@ TEST_F(ExecutionTest, SignTest) { TEST_F(ExecutionTest, WaveIntrinsicsDDITest) { #ifndef _HLK_CONF CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; D3D12_FEATURE_DATA_D3D12_OPTIONS1 O; if (FAILED(pDevice->CheckFeatureSupport( @@ -2814,7 +2642,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsTest) { static const int DispatchGroupCount = 1; CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; if (!DoesDeviceSupportWaveOps(pDevice)) { @@ -2841,7 +2669,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsTest) { CComPtr pUavHeap; CComPtr pCommandAllocator; FenceObj FO; - bool dxbc = UseDxbc(); + bool dxbc = useDxbc(); const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData); CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue", @@ -3172,7 +3000,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) { CComPtr pVertexBuffer; D3D12_VERTEX_BUFFER_VIEW vertexBufferView; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; if (!DoesDeviceSupportWaveOps(pDevice)) { // Optional feature, so it's correct to not support it if declared as such. @@ -3229,7 +3057,7 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) { CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView); - bool dxbc = UseDxbc(); + bool dxbc = useDxbc(); // Set up UAV resource. std::vector values; @@ -3491,12 +3319,6 @@ TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) { } } -struct ShaderOpTestResult { - st::ShaderOp *ShaderOp; - std::shared_ptr ShaderOpSet; - std::shared_ptr Test; -}; - struct SPrimitives { float f_float; float f_float2; @@ -3504,87 +3326,19 @@ struct SPrimitives { float f_float2_o; }; -std::shared_ptr -RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, - LPCSTR pName, - st::ShaderOpTest::TInitCallbackFn pInitCallback, - st::ShaderOpTest::TShaderCallbackFn pShaderCallback, - std::shared_ptr ShaderOpSet) { - st::ShaderOp *pShaderOp; - if (pName == nullptr) { - if (ShaderOpSet->ShaderOps.size() != 1) { - VERIFY_FAIL(L"Expected a single shader operation."); - } - pShaderOp = ShaderOpSet->ShaderOps[0].get(); - } else { - pShaderOp = ShaderOpSet->GetShaderOp(pName); - } - if (pShaderOp == nullptr) { - std::string msg = "Unable to find shader op "; - msg += pName; - msg += "; available ops"; - const char sep = ':'; - for (auto &pAvailOp : ShaderOpSet->ShaderOps) { - msg += sep; - msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]"; - } - CA2W msgWide(msg.c_str()); - VERIFY_FAIL(msgWide.m_psz); - } - - // This won't actually be used since we're supplying the device, - // but let's make it consistent. 
- pShaderOp->UseWarpDevice = GetTestParamUseWARP(true); - - std::shared_ptr test = std::make_shared(); - test->SetDxcSupport(&support); - test->SetInitCallback(pInitCallback); - test->SetShaderCallback(pShaderCallback); - test->SetDevice(pDevice); - test->RunShaderOp(pShaderOp); - - std::shared_ptr result = - std::make_shared(); - result->ShaderOpSet = ShaderOpSet; - result->Test = test; - result->ShaderOp = pShaderOp; - return result; -} - -std::shared_ptr -RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, - LPCSTR pName, - st::ShaderOpTest::TInitCallbackFn pInitCallback, - std::shared_ptr ShaderOpSet) { - return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback, - nullptr, ShaderOpSet); -} - -std::shared_ptr -RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support, - IStream *pStream, LPCSTR pName, - st::ShaderOpTest::TInitCallbackFn pInitCallback) { - DXASSERT_NOMSG(pStream != nullptr); - std::shared_ptr ShaderOpSet = - std::make_shared(); - st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get()); - return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback, - ShaderOpSet); -} - TEST_F(ExecutionTest, OutOfBoundsTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); // Single operation test at the moment. CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; - std::shared_ptr test = - RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr); + std::shared_ptr test = + st::RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr); MappedData data; // Read back to CPU and examine contents - should get pure red. { @@ -3601,15 +3355,15 @@ TEST_F(ExecutionTest, SaturateTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); // Single operation test at the moment. CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; - std::shared_ptr test = - RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr); + std::shared_ptr test = + st::RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr); MappedData data; test->Test->GetReadBackData("U0", &data); const float *pValues = (float *)data.data(); @@ -3636,11 +3390,11 @@ void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName, WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); // Single operation test at the moment. 
CComPtr pDevice; - if (!CreateDevice(&pDevice, testModel)) + if (!createDevice(&pDevice, testModel)) return; // As this is used, 6.2 requirement always comes with requiring native 16-bit @@ -3653,8 +3407,8 @@ void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName, return; } - std::shared_ptr test = - RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr); + std::shared_ptr test = + st::RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr); MappedData data; D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc; UINT width = (UINT)D.Width; @@ -3786,14 +3540,14 @@ TEST_F(ExecutionTest, PartialDerivTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; - std::shared_ptr test = - RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr); + std::shared_ptr test = + st::RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr); MappedData data; D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc; UINT width = (UINT)D.Width; @@ -3894,10 +3648,10 @@ TEST_F(ExecutionTest, DerivativesTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; std::shared_ptr ShaderOpSet = @@ -3977,10 +3731,10 @@ TEST_F(ExecutionTest, QuadReadTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; if (!DoesDeviceSupportWaveOps(pDevice)) { @@ -4033,8 +3787,9 @@ TEST_F(ExecutionTest, QuadReadTest) { // Test Compute Shader pShaderOp->CS = CS; - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "QuadRead", nullptr, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, + ShaderOpSet); MappedData data; test->Test->GetReadBackData("U0", &data); @@ -4055,8 +3810,8 @@ TEST_F(ExecutionTest, QuadReadTest) { // Disable CS so mesh goes forward pShaderOp->CS = nullptr; - test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", + nullptr, ShaderOpSet); test->Test->GetReadBackData("U1", &data); pPixels = (UINT *)data.data(); // Test first, second and center quads @@ -4124,10 +3879,10 @@ TEST_F(ExecutionTest, ComputeSampleTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; std::shared_ptr 
ShaderOpSet = @@ -4175,7 +3930,7 @@ TEST_F(ExecutionTest, ComputeSampleTest) { } // Test 1D compute shader - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet); MappedData data; @@ -4190,8 +3945,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) { pShaderOp->CS = CS2; test.reset(); - test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", + SampleInitFn, ShaderOpSet); test->Test->GetReadBackData("U0", &data); pPixels = (UINT *)data.data(); @@ -4203,8 +3958,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) { if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) { // Disable CS so mesh goes forward pShaderOp->CS = nullptr; - test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", + SampleInitFn, ShaderOpSet); test->Test->GetReadBackData("U1", &data); pPixels = (UINT *)data.data(); @@ -4221,8 +3976,8 @@ TEST_F(ExecutionTest, ComputeSampleTest) { pShaderOp->AS = AS2; pShaderOp->MS = MS2; - test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", + SampleInitFn, ShaderOpSet); test->Test->GetReadBackData("U1", &data); pPixels = (UINT *)data.data(); @@ -4251,7 +4006,7 @@ TEST_F(ExecutionTest, ATOWriteMSAATest) { #else D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7; #endif - if (!CreateDevice(&pDevice, sm)) + if (!createDevice(&pDevice, sm)) return; #ifndef WRITEMSAA_FALLBACK @@ -4517,7 +4272,7 @@ TEST_F(ExecutionTest, ATOProgOffset) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -4550,7 +4305,7 @@ TEST_F(ExecutionTest, ATOProgOffset) { D3D_SHADER_MODEL sm = TestShaderModels[i]; CComPtr pDevice; - if (!CreateDevice(&pDevice, sm, /*skipUnsupported*/ false)) { + if (!createDevice(&pDevice, sm, /*skipUnsupported*/ false)) { LogCommentFmt(L"Device does not support shader model 6.%1u", ((UINT)sm & 0x0f)); break; @@ -4603,8 +4358,9 @@ TEST_F(ExecutionTest, ATOProgOffset) { } // Test compute shader - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "ProgOffset", SampleInitFn, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", + SampleInitFn, ShaderOpSet); MappedData data; test->Test->GetReadBackData("U0", &data); @@ -4614,8 +4370,8 @@ TEST_F(ExecutionTest, ATOProgOffset) { pShaderOp->CS = nullptr; if (DoesDeviceSupportMeshShaders(pDevice)) { - test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", + SampleInitFn, ShaderOpSet); // PS test->Test->GetReadBackData("U0", &data); @@ -4632,8 +4388,8 @@ TEST_F(ExecutionTest, ATOProgOffset) { // Disable MS so PS goes forward pShaderOp->MS = nullptr; - test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", + SampleInitFn, ShaderOpSet); 
test->Test->GetReadBackData("U0", &data); VerifyProgOffsetResults((UINT *)data.data(), true); @@ -4653,10 +4409,10 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_7)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_7)) return; if (!DoesDeviceSupportAdvancedTexOps(pDevice)) { @@ -4701,7 +4457,7 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) { }; // Test compute shader - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet); MappedData data; @@ -4718,8 +4474,8 @@ TEST_F(ExecutionTest, ATOSampleCmpLevelTest) { if (DoesDeviceSupportMeshShaders(pDevice)) { // Disable CS so mesh goes forward pShaderOp->CS = nullptr; - test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", - SampleInitFn, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", + SampleInitFn, ShaderOpSet); test->Test->GetReadBackData("U0", &data); pPixels = (UINT *)data.data(); @@ -5298,7 +5054,7 @@ TEST_F(ExecutionTest, ATORawGather) { D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7; #endif CComPtr pDevice; - if (!CreateDevice(&pDevice, sm)) + if (!createDevice(&pDevice, sm)) return; #ifndef RAWGATHER_FALLBACK @@ -5528,7 +5284,7 @@ void ExecutionTest::RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel) { WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pDevice; - if (!CreateDevice(&pDevice, shaderModel)) { + if (!createDevice(&pDevice, shaderModel)) { return; } @@ -5628,9 +5384,9 @@ void ExecutionTest::RunBasicShaderModelTest(CComPtr pDevice, }; CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryFPOp", // this callback is called when the test is creating the resource to run // the test @@ -5958,178 +5714,6 @@ struct SPackUnpackOpOutUnpacked { std::array outputClampedUint16; std::array outputClampedInt16; }; - -// Parameter representation for taef data-driven tests -struct TableParameter { - LPCWSTR m_name; - enum TableParameterType { - INT8, - INT16, - INT32, - UINT, - FLOAT, - HALF, - DOUBLE, - STRING, - BOOL, - INT8_TABLE, - INT16_TABLE, - INT32_TABLE, - FLOAT_TABLE, - HALF_TABLE, - DOUBLE_TABLE, - STRING_TABLE, - UINT8_TABLE, - UINT16_TABLE, - UINT32_TABLE, - BOOL_TABLE - }; - TableParameter(LPCWSTR name, TableParameterType type, bool required) - : m_name(name), m_type(type), m_required(required) {} - TableParameterType m_type; - bool m_required; // required parameter - int8_t m_int8; - int16_t m_int16; - int m_int32; - unsigned int m_uint; - float m_float; - uint16_t m_half; // no such thing as half type in c++. 
Use int16 instead - double m_double; - bool m_bool; - WEX::Common::String m_str; - std::vector m_int8Table; - std::vector m_int16Table; - std::vector m_int32Table; - std::vector m_uint8Table; - std::vector m_uint16Table; - std::vector m_uint32Table; - std::vector m_floatTable; - std::vector m_halfTable; // no such thing as half type in c++ - std::vector m_doubleTable; - std::vector m_boolTable; - std::vector m_StringTable; -}; - -class TableParameterHandler { -private: - HRESULT ParseTableRow(); - -public: - TableParameter *m_table; - size_t m_tableSize; - TableParameterHandler(TableParameter *pTable, size_t size) - : m_table(pTable), m_tableSize(size) { - clearTableParameter(); - VERIFY_SUCCEEDED(ParseTableRow()); - } - - TableParameter *GetTableParamByName(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &m_table[i]; - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - void clearTableParameter() { - for (size_t i = 0; i < m_tableSize; ++i) { - m_table[i].m_int32 = 0; - m_table[i].m_uint = 0; - m_table[i].m_double = 0; - m_table[i].m_bool = false; - m_table[i].m_str = WEX::Common::String(); - } - } - - template std::vector *GetDataArray(LPCWSTR name) { - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_int32Table); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_int8Table); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_int16Table); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_uint32Table); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_floatTable); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - // TODO: uin16_t may be used to represent two different types when we - // introduce uint16 - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_halfTable); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return &(m_table[i].m_doubleTable); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } - - template <> std::vector *GetDataArray(LPCWSTR name) { - for (size_t i = 0; i < m_tableSize; ++i) { - if (_wcsicmp(name, m_table[i].m_name) == 0) { - return 
&(m_table[i].m_boolTable); - } - } - DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); - return nullptr; - } -}; - static TableParameter UnaryFPOpParameters[] = { {L"ShaderOp.Target", TableParameter::STRING, true}, {L"ShaderOp.Text", TableParameter::STRING, true}, @@ -6460,381 +6044,6 @@ static TableParameter PackUnpackOpParameters[] = { {L"Validation.Input", TableParameter::UINT32_TABLE, true}, }; -static bool IsHexString(PCWSTR str, uint16_t *value) { - std::wstring wString(str); - wString.erase(std::remove(wString.begin(), wString.end(), L' '), - wString.end()); - LPCWSTR wstr = wString.c_str(); - if (wcsncmp(wstr, L"0x", 2) == 0 || wcsncmp(wstr, L"0b", 2) == 0) { - *value = (uint16_t)wcstol(wstr, NULL, 0); - return true; - } - return false; -} - -static HRESULT ParseDataToFloat(PCWSTR str, float &value) { - std::wstring wString(str); - wString.erase(std::remove(wString.begin(), wString.end(), L' '), - wString.end()); - wString.erase(std::remove(wString.begin(), wString.end(), L'\n'), - wString.end()); - PCWSTR wstr = wString.data(); - if (_wcsicmp(wstr, L"NaN") == 0) { - value = NAN; - } else if (_wcsicmp(wstr, L"-inf") == 0) { - value = -(INFINITY); - } else if (_wcsicmp(wstr, L"inf") == 0) { - value = INFINITY; - } else if (_wcsicmp(wstr, L"-denorm") == 0) { - value = -(FLT_MIN / 2); - } else if (_wcsicmp(wstr, L"denorm") == 0) { - value = FLT_MIN / 2; - } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 || - _wcsicmp(wstr, L"-0") == 0) { - value = -0.0f; - } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 || - _wcsicmp(wstr, L"0") == 0) { - value = 0.0f; - } else if (_wcsnicmp(wstr, L"0x", 2) == - 0) { // For hex values, take values literally - unsigned temp_i = std::stoul(wstr, nullptr, 16); - value = (float &)temp_i; - } else { - // evaluate the expression of wstring - double val = _wtof(wstr); - if (val == 0) { - LogErrorFmt(L"Failed to parse parameter %s to float", wstr); - return E_FAIL; - } - value = (float)val; - } - return S_OK; -} - -static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) { - std::wstring wString(str); - wString.erase(std::remove(wString.begin(), wString.end(), L' '), - wString.end()); - PCWSTR wstr = wString.data(); - // evaluate the expression of string - if (_wcsicmp(wstr, L"0") == 0 || _wcsicmp(wstr, L"0x00000000") == 0) { - value = 0; - return S_OK; - } - wchar_t *end; - unsigned int val = std::wcstoul(wstr, &end, 0); - if (val == 0) { - LogErrorFmt(L"Failed to parse parameter %s to int", wstr); - return E_FAIL; - } - value = val; - return S_OK; -} - -static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) { - std::wstring wstr(str); - size_t curPosition = 0; - // parse a string of dot product separated by commas - for (size_t i = 0; i < count; ++i) { - size_t nextPosition = wstr.find(L",", curPosition); - if (FAILED(ParseDataToFloat( - wstr.substr(curPosition, nextPosition - curPosition).data(), - *(ptr + i)))) { - return E_FAIL; - } - curPosition = nextPosition + 1; - } - return S_OK; -} - -static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) { - std::wstring wstr(str); - size_t curPosition = 0; - // parse a string of dot product separated by commas - for (size_t i = 0; i < count; ++i) { - size_t nextPosition = wstr.find(L",", curPosition); - float floatValue; - if (FAILED(ParseDataToFloat( - wstr.substr(curPosition, nextPosition - curPosition).data(), - floatValue))) { - return E_FAIL; - } - *(ptr + i) = ConvertFloat32ToFloat16(floatValue); - 
curPosition = nextPosition + 1; - } - return S_OK; -} - -static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, - size_t count) { - std::wstring wstr(str); - size_t curPosition = 0; - // parse a string of dot product separated by commas - for (size_t i = 0; i < count; ++i) { - size_t nextPosition = wstr.find(L",", curPosition); - if (FAILED(ParseDataToUint( - wstr.substr(curPosition, nextPosition - curPosition).data(), - *(ptr + i)))) { - return E_FAIL; - } - curPosition = nextPosition + 1; - } - return S_OK; -} - -HRESULT TableParameterHandler::ParseTableRow() { - TableParameter *table = m_table; - for (unsigned int i = 0; i < m_tableSize; ++i) { - switch (table[i].m_type) { - case TableParameter::INT8: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_int32)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int16 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_int8 = (int8_t)(table[i].m_int32); - break; - case TableParameter::INT16: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_int32)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int16 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_int16 = (short)(table[i].m_int32); - break; - case TableParameter::INT32: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_int32)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - break; - case TableParameter::UINT: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_uint)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - break; - case TableParameter::DOUBLE: - if (FAILED(WEX::TestExecution::TestData::TryGetValue( - table[i].m_name, table[i].m_double)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - break; - case TableParameter::STRING: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_str)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - break; - case TableParameter::BOOL: - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - table[i].m_str)) && - table[i].m_bool) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - break; - case TableParameter::INT8_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - // TryGetValue does not suppport reading from int8 - table[i].m_int8Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_int8Table[j] = (int8_t)tempTable[j]; - } - break; - } - case TableParameter::INT16_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - // TryGetValue does not suppport reading from int8 - table[i].m_int16Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_int16Table[j] = (int16_t)tempTable[j]; - } - break; - 
} - case TableParameter::INT32_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_int32Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_int32Table[j] = tempTable[j]; - } - break; - } - case TableParameter::UINT8_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - // TryGetValue does not suppport reading from int8 - table[i].m_int8Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_int8Table[j] = (uint8_t)tempTable[j]; - } - break; - } - case TableParameter::UINT16_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - // TryGetValue does not suppport reading from int8 - table[i].m_uint16Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_uint16Table[j] = (uint16_t)tempTable[j]; - } - break; - } - case TableParameter::UINT32_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_uint32Table.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_uint32Table[j] = tempTable[j]; - } - break; - } - case TableParameter::FLOAT_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_floatTable.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - ParseDataToFloat(tempTable[j], table[i].m_floatTable[j]); - } - break; - } - case TableParameter::HALF_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_halfTable.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - uint16_t value = 0; - if (IsHexString(tempTable[j], &value)) { - table[i].m_halfTable[j] = value; - } else { - float val; - ParseDataToFloat(tempTable[j], val); - if (isdenorm(val)) - table[i].m_halfTable[j] = - signbit(val) ? 
Float16NegDenorm : Float16PosDenorm; - else - table[i].m_halfTable[j] = ConvertFloat32ToFloat16(val); - } - } - break; - } - case TableParameter::DOUBLE_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_doubleTable.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_doubleTable[j] = tempTable[j]; - } - break; - } - case TableParameter::BOOL_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_boolTable.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_boolTable[j] = tempTable[j]; - } - break; - } - case TableParameter::STRING_TABLE: { - WEX::TestExecution::TestDataArray tempTable; - if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, - tempTable)) && - table[i].m_required) { - // TryGetValue does not suppport reading from int8 - LogErrorFmt(L"Failed to get %s", table[i].m_name); - return E_FAIL; - } - table[i].m_StringTable.resize(tempTable.GetSize()); - for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { - table[i].m_StringTable[j] = tempTable[j]; - } - break; - } - default: - DXASSERT_NOMSG("Invalid Parameter Type"); - } - if (errno == ERANGE) { - LogErrorFmt(L"got out of range value for table %s", table[i].m_name); - return E_FAIL; - } - } - return S_OK; -} - static bool CompareOutputWithExpectedValueInt(int output, int ref, int tolerance) { return ((output - ref) <= tolerance) && ((ref - output) <= tolerance); @@ -6972,10 +6181,10 @@ TEST_F(ExecutionTest, UnaryFloatOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -6997,7 +6206,7 @@ TEST_F(ExecutionTest, UnaryFloatOpTest) { size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -7035,10 +6244,10 @@ TEST_F(ExecutionTest, BinaryFloatOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7065,7 +6274,7 @@ TEST_F(ExecutionTest, BinaryFloatOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryFPOp", // this callback is called when the 
test // is creating the resource to run the test @@ -7125,10 +6334,10 @@ TEST_F(ExecutionTest, TertiaryFloatOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7155,7 +6364,7 @@ TEST_F(ExecutionTest, TertiaryFloatOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -7198,10 +6407,10 @@ TEST_F(ExecutionTest, UnaryHalfOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -7232,7 +6441,7 @@ TEST_F(ExecutionTest, UnaryHalfOpTest) { size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -7273,10 +6482,10 @@ TEST_F(ExecutionTest, BinaryHalfOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -7312,7 +6521,7 @@ TEST_F(ExecutionTest, BinaryHalfOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -7384,10 +6593,10 @@ TEST_F(ExecutionTest, TertiaryHalfOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -7422,7 +6631,7 @@ TEST_F(ExecutionTest, TertiaryHalfOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -7470,10 +6679,10 @@ TEST_F(ExecutionTest, UnaryIntOpTest) { 
WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7492,7 +6701,7 @@ TEST_F(ExecutionTest, UnaryIntOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -7530,10 +6739,10 @@ TEST_F(ExecutionTest, UnaryUintOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7552,7 +6761,7 @@ TEST_F(ExecutionTest, UnaryUintOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -7590,10 +6799,10 @@ TEST_F(ExecutionTest, BinaryIntOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7617,7 +6826,7 @@ TEST_F(ExecutionTest, BinaryIntOpTest) { size_t numExpected = Validation_Expected2->size() == 0 ? 
1 : 2; - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -7680,10 +6889,10 @@ TEST_F(ExecutionTest, TertiaryIntOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7705,7 +6914,7 @@ TEST_F(ExecutionTest, TertiaryIntOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -7750,10 +6959,10 @@ TEST_F(ExecutionTest, BinaryUintOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7775,7 +6984,7 @@ TEST_F(ExecutionTest, BinaryUintOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); int numExpected = Validation_Expected2->size() == 0 ? 1 : 2; - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -7842,10 +7051,10 @@ TEST_F(ExecutionTest, TertiaryUintOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } // Read data from the table @@ -7867,7 +7076,7 @@ TEST_F(ExecutionTest, TertiaryUintOpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -7916,10 +7125,10 @@ TEST_F(ExecutionTest, UnaryInt16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -7946,7 +7155,7 @@ TEST_F(ExecutionTest, UnaryInt16OpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = 
st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -7984,10 +7193,10 @@ TEST_F(ExecutionTest, UnaryUint16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -8014,7 +7223,7 @@ TEST_F(ExecutionTest, UnaryUint16OpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "UnaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -8053,10 +7262,10 @@ TEST_F(ExecutionTest, BinaryInt16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -8089,7 +7298,7 @@ TEST_F(ExecutionTest, BinaryInt16OpTest) { size_t numExpected = Validation_Expected2->size() == 0 ? 1 : 2; - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -8151,10 +7360,10 @@ TEST_F(ExecutionTest, TertiaryInt16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -8185,7 +7394,7 @@ TEST_F(ExecutionTest, TertiaryInt16OpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryIntOp", // this callback is called when the test // is creating the resource to run the test @@ -8228,10 +7437,10 @@ TEST_F(ExecutionTest, BinaryUint16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -8262,7 +7471,7 @@ TEST_F(ExecutionTest, BinaryUint16OpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); int numExpected = Validation_Expected2->size() == 0 ? 
1 : 2; - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -8326,10 +7535,10 @@ TEST_F(ExecutionTest, TertiaryUint16OpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -8361,7 +7570,7 @@ TEST_F(ExecutionTest, TertiaryUint16OpTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_int32; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryUintOp", // this callback is called when the test // is creating the resource to run the test @@ -8916,10 +8125,10 @@ TEST_F(ExecutionTest, DotTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } @@ -8946,7 +8155,7 @@ TEST_F(ExecutionTest, DotTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = Validation_Input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "DotOp", // this callback is called when the test // is creating the resource to run the test @@ -9000,10 +8209,10 @@ TEST_F(ExecutionTest, Dot2AddHalfTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { return; } @@ -9036,7 +8245,7 @@ TEST_F(ExecutionTest, Dot2AddHalfTest) { handler.GetTableParamByName(L"Validation.Tolerance")->m_double; size_t count = validation_input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "Dot2AddHalfOp", // this callback is called when the test // is creating the resource to run the test @@ -9088,10 +8297,10 @@ TEST_F(ExecutionTest, Dot4AddI8PackedTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { return; } @@ -9112,7 +8321,7 @@ TEST_F(ExecutionTest, Dot4AddI8PackedTest) { size_t count = validation_input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, 
"Dot4AddI8PackedOp", // this callback is called when the test // is creating the resource to run the test @@ -9151,10 +8360,10 @@ TEST_F(ExecutionTest, Dot4AddU8PackedTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) { return; } @@ -9175,7 +8384,7 @@ TEST_F(ExecutionTest, Dot4AddU8PackedTest) { size_t count = validation_input1->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "Dot4AddU8PackedOp", // this callback is called when the test // is creating the resource to run the test @@ -9214,10 +8423,10 @@ TEST_F(ExecutionTest, Msad4Test) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter); @@ -9238,7 +8447,7 @@ TEST_F(ExecutionTest, Msad4Test) { size_t count = Validation_Expected->size(); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "Msad4", // this callback is called when the test // is creating the resource to run the test @@ -9296,10 +8505,10 @@ TEST_F(ExecutionTest, DenormBinaryFloatOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -9340,7 +8549,7 @@ TEST_F(ExecutionTest, DenormBinaryFloatOpTest) { "must have same number of expected values"); } - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "BinaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -9407,10 +8616,10 @@ TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) { return; } @@ -9453,7 +8662,7 @@ TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) { "must have same number of expected values"); } - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "TertiaryFPOp", // this callback is called when the test // is creating the resource to run the test @@ -9846,10 +9055,10 @@ void ExecutionTest::WaveIntrinsicsActivePrefixTest( static const unsigned int 
DispatchGroupCount = 1; static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount; CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } if (!DoesDeviceSupportWaveOps(pDevice)) { @@ -9881,31 +9090,33 @@ void ExecutionTest::WaveIntrinsicsActivePrefixTest( for (size_t maskIndex = 0; maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction); ++maskIndex) { - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "WaveIntrinsicsOp", - // this callback is called when the test - // is creating the resource to run the test - [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { - VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp")); - size_t size = sizeof(PerThreadData) * ThreadCount; - Data.resize(size); - PerThreadData *pPrimitives = (PerThreadData *)Data.data(); - // 4 different inputs for each operation test - size_t index = 0; - std::vector *IntList = InputDataList[setIndex]; - while (index < ThreadCount) { - PerThreadData *p = &pPrimitives[index]; - p->firstLaneId = 0xFFFFBFFF; - p->laneIndex = 0xFFFFBFFF; - p->mask = MaskFunctionTable[maskIndex]((int)index); - p->input = (*IntList)[index % IntList->size()]; - p->output = 0xFFFFBFFF; - index++; - } - // use shader from data table - pShaderOp->Shaders.at(0).Text = Text.m_psz; - }, - ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse( + pDevice, m_support, "WaveIntrinsicsOp", + // this callback is called when the test + // is creating the resource to run the test + [&](LPCSTR Name, std::vector &Data, + st::ShaderOp *pShaderOp) { + VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp")); + size_t size = sizeof(PerThreadData) * ThreadCount; + Data.resize(size); + PerThreadData *pPrimitives = (PerThreadData *)Data.data(); + // 4 different inputs for each operation test + size_t index = 0; + std::vector *IntList = InputDataList[setIndex]; + while (index < ThreadCount) { + PerThreadData *p = &pPrimitives[index]; + p->firstLaneId = 0xFFFFBFFF; + p->laneIndex = 0xFFFFBFFF; + p->mask = MaskFunctionTable[maskIndex]((int)index); + p->input = (*IntList)[index % IntList->size()]; + p->output = 0xFFFFBFFF; + index++; + } + // use shader from data table + pShaderOp->Shaders.at(0).Text = Text.m_psz; + }, + ShaderOpSet); // Check the value MappedData data; @@ -10106,11 +9317,11 @@ void ExecutionTest::WaveIntrinsicsMultiPrefixOpTest( constexpr size_t ThreadCount = ThreadsPerGroup * DispatchGroupSize; CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_5)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_5)) { return; } @@ -10134,30 +9345,31 @@ void ExecutionTest::WaveIntrinsicsMultiPrefixOpTest( for (size_t maskIndex = 0; maskIndex < _countof(MaskFunctionTable); ++maskIndex) { - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "WaveIntrinsicsOp", - [&](LPCSTR name, std::vector &data, st::ShaderOp *pShaderOp) { - UNREFERENCED_PARAMETER(name); - - const size_t dataSize = sizeof(PerThreadData) * ThreadCount; - - data.resize(dataSize); - PerThreadData *pThreadData = - reinterpret_cast(data.data()); - - for (size_t i = 0; i != ThreadCount; ++i) { - pThreadData[i].key = keys->at(i % keys->size()); - pThreadData[i].value = 
values->at(i % values->size()); - pThreadData[i].firstLaneId = 0xdeadbeef; - pThreadData[i].laneId = 0xdeadbeef; - pThreadData[i].mask = MaskFunctionTable[maskIndex]((int)i); - pThreadData[i].result = 0xdeadbeef; - } + std::shared_ptr test = + st::RunShaderOpTestAfterParse( + pDevice, m_support, "WaveIntrinsicsOp", + [&](LPCSTR name, std::vector &data, st::ShaderOp *pShaderOp) { + UNREFERENCED_PARAMETER(name); + + const size_t dataSize = sizeof(PerThreadData) * ThreadCount; + + data.resize(dataSize); + PerThreadData *pThreadData = + reinterpret_cast(data.data()); + + for (size_t i = 0; i != ThreadCount; ++i) { + pThreadData[i].key = keys->at(i % keys->size()); + pThreadData[i].value = values->at(i % values->size()); + pThreadData[i].firstLaneId = 0xdeadbeef; + pThreadData[i].laneId = 0xdeadbeef; + pThreadData[i].mask = MaskFunctionTable[maskIndex]((int)i); + pThreadData[i].result = 0xdeadbeef; + } - pShaderOp->Shaders.at(0).Text = shaderSource; - pShaderOp->Shaders.at(0).Target = shaderProfile; - }, - ShaderOpSet); + pShaderOp->Shaders.at(0).Text = shaderSource; + pShaderOp->Shaders.at(0).Target = shaderProfile; + }, + ShaderOpSet); MappedData mappedData; test->Test->GetReadBackData("SWaveIntrinsicsOp", &mappedData); @@ -10234,11 +9446,11 @@ TEST_F(ExecutionTest, CBufferTestHalf) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); // Single operation test at the moment. CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_2)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_2)) return; if (!DoesDeviceSupportNative16bitOps(pDevice)) { @@ -10250,7 +9462,7 @@ TEST_F(ExecutionTest, CBufferTestHalf) { uint16_t InputData[] = {0x3F80, 0x3F00, 0x3D80, 0x7BFF}; - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "CBufferTestHalf", [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { UNREFERENCED_PARAMETER(pShaderOp); @@ -10280,7 +9492,7 @@ TEST_F(ExecutionTest, CBufferTestHalf) { } void TestBarycentricVariant(bool checkOrdering, - std::shared_ptr test) { + std::shared_ptr test) { MappedData data; D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc; UINT width = (UINT)D.Width; @@ -10364,10 +9576,10 @@ TEST_F(ExecutionTest, BarycentricsTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_1)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_1)) return; if (!DoesDeviceSupportBarycentrics(pDevice)) { @@ -10386,9 +9598,9 @@ TEST_F(ExecutionTest, BarycentricsTest) { auto ResourceCallbackFnNoShift = MakeBarycentricsResourceInitCallbackFn(test_iteration); - std::shared_ptr test = - RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics", - ResourceCallbackFnNoShift, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics", + ResourceCallbackFnNoShift, ShaderOpSet); TestBarycentricVariant(false, test); // Now test that barycentric ordering is consistent @@ -10400,8 +9612,9 @@ TEST_F(ExecutionTest, BarycentricsTest) { auto ResourceCallbackFn = 
MakeBarycentricsResourceInitCallbackFn(test_iteration); - std::shared_ptr test2 = RunShaderOpTestAfterParse( - pDevice, m_support, "Barycentrics", ResourceCallbackFn, ShaderOpSet); + std::shared_ptr test2 = + st::RunShaderOpTestAfterParse(pDevice, m_support, "Barycentrics", + ResourceCallbackFn, ShaderOpSet); TestBarycentricVariant(true, test2); } } @@ -10647,7 +9860,7 @@ bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, CComPtr &pStream, const char *&sTy, const char *&additionalOptions) { - if (!CreateDevice(&pDevice, shaderModel)) { + if (!createDevice(&pDevice, shaderModel)) { return false; } @@ -10692,7 +9905,7 @@ bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, } // read shader config - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); return true; } @@ -10784,7 +9997,7 @@ void ExecutionTest::RunComputeRawBufferLdStTest( (int)sizeof(Ty), additionalOptions) != -1); // run the shader - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, shaderOpName, [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || @@ -10839,7 +10052,7 @@ void ExecutionTest::RunGraphicsRawBufferLdStTest( (int)sizeof(Ty), additionalOptions) != -1); // run the shader - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, shaderOpName, [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || @@ -10921,7 +10134,7 @@ TEST_F(ExecutionTest, PackUnpackTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; @@ -10929,14 +10142,14 @@ TEST_F(ExecutionTest, PackUnpackTest) { string args = "-enable-16bit-types -DPACKUNPACK_PLACEHOLDER"; string target = "cs_6_2"; - if (!CreateDevice(&pDevice)) { + if (!createDevice(&pDevice)) { return; } #else string args = "-enable-16bit-types"; string target = "cs_6_6"; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) { + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) { return; } #endif @@ -10962,7 +10175,7 @@ TEST_F(ExecutionTest, PackUnpackTest) { std::vector expectedPacked(count / 4); std::vector expectedUnpacked(count / 4); - std::shared_ptr test = RunShaderOpTest( + std::shared_ptr test = st::RunShaderOpTest( pDevice, m_support, pStream, "PackUnpackOp", // this callback is called when the test // is creating the resource to run the test @@ -11316,7 +10529,7 @@ TEST_F(ExecutionTest, SignatureResourcesTest) { "}\n"; CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; RunResourceTest(pDevice, pShader.c_str(), L"cs_6_6", /*isDynamic*/ false); @@ -11355,7 +10568,7 @@ TEST_F(ExecutionTest, DynamicResourcesTest) { "}\n"; CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; // ResourceDescriptorHeap/SamplerDescriptorHeap requires Resource Binding Tier @@ -11398,7 +10611,7 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) { WEX::TestExecution::SetVerifyOutput verifySettings( 
WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -11436,7 +10649,7 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) { ((UINT)sm & 0x0f)); CComPtr pDevice; - if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) { + if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) { continue; } D3D12_FEATURE_DATA_D3D12_OPTIONS devOptions; @@ -11495,9 +10708,10 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) { // Test Compute shader { pShaderOp->CS = pShaderOp->GetString("CS66"); - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr, - ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, + "DynamicResourcesDynamicIndexing", + nullptr, ShaderOpSet); MappedData resultData; test->Test->GetReadBackData("g_result", &resultData); @@ -11512,9 +10726,10 @@ TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) { pShaderOp->CS = nullptr; pShaderOp->VS = pShaderOp->GetString("VS66"); pShaderOp->PS = pShaderOp->GetString("PS66"); - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr, - ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, + "DynamicResourcesDynamicIndexing", + nullptr, ShaderOpSet); MappedData resultVSData; MappedData resultPSData; @@ -11577,19 +10792,20 @@ void RunWaveSizeTest(UINT minWaveSize, UINT maxWaveSize, waveSize) != -1); // run the shader - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "WaveSizeTest", - [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { - VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10))); - pShaderOp->Shaders.at(0).Arguments = compilerOptions; - pShaderOp->Shaders.at(0).Text = waveSizeTestShader; - - VERIFY_IS_TRUE(sizeof(WaveSizeTestData) * MAX_WAVESIZE <= - Data.size()); - WaveSizeTestData *pInData = (WaveSizeTestData *)Data.data(); - memset(pInData, 0, sizeof(WaveSizeTestData) * MAX_WAVESIZE); - }, - ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse( + pDevice, m_support, "WaveSizeTest", + [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { + VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10))); + pShaderOp->Shaders.at(0).Arguments = compilerOptions; + pShaderOp->Shaders.at(0).Text = waveSizeTestShader; + + VERIFY_IS_TRUE(sizeof(WaveSizeTestData) * MAX_WAVESIZE <= + Data.size()); + WaveSizeTestData *pInData = (WaveSizeTestData *)Data.data(); + memset(pInData, 0, sizeof(WaveSizeTestData) * MAX_WAVESIZE); + }, + ShaderOpSet); // verify expected values MappedData dataUav; @@ -11665,7 +10881,7 @@ void ExecuteWaveSizeRangeInstance(UINT minWaveSize, UINT maxWaveSize, }; // run the shader - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "WaveSizeTest", [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10))); @@ -11737,7 +10953,7 @@ void ExecutionTest::WaveSizeTest() { WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6, + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6, /*skipUnsupported*/ false)) { return; } @@ -11765,7 +10981,7 @@ 
void ExecutionTest::WaveSizeTest() { CComPtr pStream; std::shared_ptr ShaderOpSet = std::make_shared(); - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get()); LogCommentFmt(L"Testing WaveSize attribute for shader model 6.6."); @@ -11777,7 +10993,7 @@ void ExecutionTest::WaveSizeRangeTest() { WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_8, + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_8, /*skipUnsupported*/ false)) { return; } @@ -11805,7 +11021,7 @@ void ExecutionTest::WaveSizeRangeTest() { CComPtr pStream; std::shared_ptr ShaderOpSet = std::make_shared(); - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get()); LogCommentFmt(L"Testing WaveSize Range attribute for shader model 6.8."); @@ -12034,7 +11250,7 @@ void VerifyAtomicResults(const BYTE *uResults, const BYTE *sResults, } } -void VerifyAtomicsRawTest(std::shared_ptr test, +void VerifyAtomicsRawTest(std::shared_ptr test, uint64_t maxIdx, size_t bitSize) { size_t stride = 8; @@ -12083,7 +11299,7 @@ void VerifyAtomicsRawTest(std::shared_ptr test, bitSize); } -void VerifyAtomicsTypedTest(std::shared_ptr test, +void VerifyAtomicsTypedTest(std::shared_ptr test, uint64_t maxIdx, size_t bitSize) { size_t stride = 8; @@ -12135,7 +11351,7 @@ void VerifyAtomicsTypedTest(std::shared_ptr test, VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize); } -void VerifyAtomicsSharedTest(std::shared_ptr test, +void VerifyAtomicsSharedTest(std::shared_ptr test, uint64_t maxIdx, size_t bitSize) { size_t stride = 8; @@ -12156,7 +11372,7 @@ void VerifyAtomicsSharedTest(std::shared_ptr test, bitSize); } -void VerifyAtomicsTest(std::shared_ptr test, +void VerifyAtomicsTest(std::shared_ptr test, uint64_t maxIdx, size_t bitSize) { VerifyAtomicsRawTest(test, maxIdx, bitSize); VerifyAtomicsTypedTest(test, maxIdx, bitSize); @@ -12166,10 +11382,10 @@ TEST_F(ExecutionTest, AtomicsTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; std::shared_ptr ShaderOpSet = @@ -12181,7 +11397,7 @@ TEST_F(ExecutionTest, AtomicsTest) { // Test compute shader LogCommentFmt( L"Verifying 32-bit integer atomic operations in compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet); VerifyAtomicsTest(test, 32 * 32, 32); @@ -12192,8 +11408,8 @@ TEST_F(ExecutionTest, AtomicsTest) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying 32-bit integer atomic operations in " L"amp/mesh/pixel shaders"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 32); VerifyAtomicsSharedTest(test, 8 * 8 * 2 + 8 * 8 * 2, 32); } @@ -12202,8 +11418,8 @@ TEST_F(ExecutionTest, AtomicsTest) { 
pShaderOp->MS = nullptr; LogCommentFmt( L"Verifying 32-bit integer atomic operations in vert/pixel shaders"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsTest(test, 64 * 64 + 6, 32); } @@ -12211,10 +11427,10 @@ TEST_F(ExecutionTest, Atomics64Test) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; if (!DoesDeviceSupportInt64(pDevice)) { @@ -12240,7 +11456,7 @@ TEST_F(ExecutionTest, Atomics64Test) { // Test compute shader LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in " L"compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 32 * 32, 64); @@ -12249,8 +11465,8 @@ TEST_F(ExecutionTest, Atomics64Test) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers " L"in amp/mesh/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", + nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64); } @@ -12258,8 +11474,8 @@ TEST_F(ExecutionTest, Atomics64Test) { pShaderOp->MS = nullptr; LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in " L"vert/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", + nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 64 * 64 + 6, 64); } @@ -12267,10 +11483,10 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; if (!DoesDeviceSupportInt64(pDevice)) { @@ -12303,7 +11519,7 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) { // Test compute shader LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw " L"buffers in compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 32 * 32, 64); @@ -12312,8 +11528,8 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw " L"buffers in amp/mesh/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64); } @@ -12321,8 
+11537,8 @@ TEST_F(ExecutionTest, AtomicsRawHeap64Test) { pShaderOp->MS = nullptr; LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw " L"buffers in vert/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsRawTest(test, 64 * 64 + 6, 64); } @@ -12330,10 +11546,10 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; if (!DoesDeviceSupportInt64(pDevice)) { @@ -12366,7 +11582,7 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) { // Test compute shader LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed " L"resources in compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet); VerifyAtomicsTypedTest(test, 32 * 32, 64); @@ -12375,8 +11591,8 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed " L"resources in amp/mesh/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsTypedTest(test, 8 * 8 * 2 + 8 * 8 * 2 + 64 * 64, 64); } @@ -12384,8 +11600,8 @@ TEST_F(ExecutionTest, AtomicsTyped64Test) { pShaderOp->MS = nullptr; LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed " L"resources in vert/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", + nullptr, ShaderOpSet); VerifyAtomicsTypedTest(test, 64 * 64 + 6, 64); } @@ -12393,10 +11609,10 @@ TEST_F(ExecutionTest, AtomicsShared64Test) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) + if (!createDevice(&pDevice, D3D_SHADER_MODEL_6_6)) return; if (!DoesDeviceSupportInt64(pDevice)) { @@ -12426,7 +11642,7 @@ TEST_F(ExecutionTest, AtomicsShared64Test) { LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared " L"variables in compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet); VerifyAtomicsSharedTest(test, 32 * 32, 64); @@ -12435,8 +11651,8 @@ TEST_F(ExecutionTest, AtomicsShared64Test) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared " L"variables in amp/mesh/pixel shader"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, 
"AtomicsRoot", + nullptr, ShaderOpSet); VerifyAtomicsSharedTest(test, 8 * 8 * 2 + 8 * 8 * 2, 64); } } @@ -12464,7 +11680,8 @@ void VerifyAtomicFloatResults(const float *results) { } } -void VerifyAtomicsFloatSharedTest(std::shared_ptr test) { +void VerifyAtomicsFloatSharedTest( + std::shared_ptr test) { MappedData Data; const float *pData = nullptr; @@ -12476,7 +11693,7 @@ void VerifyAtomicsFloatSharedTest(std::shared_ptr test) { VerifyAtomicFloatResults(pData); } -void VerifyAtomicsFloatTest(std::shared_ptr test) { +void VerifyAtomicsFloatTest(std::shared_ptr test) { // struct mirroring that in the shader struct AtomicStuff { @@ -12524,10 +11741,10 @@ TEST_F(ExecutionTest, AtomicsFloatTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); CComPtr pDevice; - if (!CreateDevice(&pDevice)) + if (!createDevice(&pDevice)) return; std::shared_ptr ShaderOpSet = @@ -12539,7 +11756,7 @@ TEST_F(ExecutionTest, AtomicsFloatTest) { // Test compute shader LogCommentFmt( L"Verifying float cmp/xchg atomic operations in compute shader"); - std::shared_ptr test = RunShaderOpTestAfterParse( + std::shared_ptr test = st::RunShaderOpTestAfterParse( pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet); VerifyAtomicsFloatTest(test); VerifyAtomicsFloatSharedTest(test); @@ -12549,8 +11766,8 @@ TEST_F(ExecutionTest, AtomicsFloatTest) { if (DoesDeviceSupportMeshShaders(pDevice)) { LogCommentFmt(L"Verifying float cmp/xchg atomic operations in " L"amp/mesh/pixel shaders"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", - nullptr, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", + nullptr, ShaderOpSet); VerifyAtomicsFloatTest(test); VerifyAtomicsFloatSharedTest(test); } @@ -12559,8 +11776,8 @@ TEST_F(ExecutionTest, AtomicsFloatTest) { pShaderOp->MS = nullptr; LogCommentFmt( L"Verifying float cmp/xchg atomic operations in vert/pixel shaders"); - test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, - ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", + nullptr, ShaderOpSet); VerifyAtomicsFloatTest(test); } @@ -12589,7 +11806,7 @@ TEST_F(ExecutionTest, HelperLaneTest) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -12604,19 +11821,20 @@ TEST_F(ExecutionTest, HelperLaneTest) { ((UINT)sm & 0x0f)); CComPtr pDevice; - if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) + if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) continue; - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "HelperLaneTestNoWave", - // this callback is called when the test is creating the resource to - // run the test - [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { - VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0")); - std::fill(Data.begin(), Data.end(), (BYTE)0xCC); - UNREFERENCED_PARAMETER(pShaderOp); - }, - ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse( + pDevice, m_support, "HelperLaneTestNoWave", + // this callback is called when the test is creating the 
resource to + // run the test + [&](LPCSTR Name, std::vector &Data, st::ShaderOp *pShaderOp) { + VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0")); + std::fill(Data.begin(), Data.end(), (BYTE)0xCC); + UNREFERENCED_PARAMETER(pShaderOp); + }, + ShaderOpSet); struct HelperLaneTestResult { int32_t is_helper_00; @@ -12989,7 +12207,7 @@ TEST_F(ExecutionTest, HelperLaneTestWave) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -13010,7 +12228,7 @@ TEST_F(ExecutionTest, HelperLaneTestWave) { bool smPassed = true; CComPtr pDevice; - if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) { + if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) { continue; } @@ -13045,9 +12263,10 @@ TEST_F(ExecutionTest, HelperLaneTestWave) { // Test Compute shader { - std::shared_ptr test = - RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave", - CleanUAVBuffer0Buffer, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, + "HelperLaneTestWave", + CleanUAVBuffer0Buffer, ShaderOpSet); MappedData uavData; test->Test->GetReadBackData("UAVBuffer0", &uavData); @@ -13069,9 +12288,10 @@ TEST_F(ExecutionTest, HelperLaneTestWave) { // Test Vertex + Pixel shader { pShaderOp->CS = nullptr; - std::shared_ptr test = - RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave", - CleanUAVBuffer0Buffer, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, + "HelperLaneTestWave", + CleanUAVBuffer0Buffer, ShaderOpSet); MappedData uavData; test->Test->GetReadBackData("UAVBuffer0", &uavData); @@ -13130,7 +12350,7 @@ TEST_F(ExecutionTest, QuadAnyAll) { WEX::TestExecution::SetVerifyOutput verifySettings( WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -13163,7 +12383,7 @@ TEST_F(ExecutionTest, QuadAnyAll) { } CComPtr pDevice; - if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) { + if (!createDevice(&pDevice, sm, false /* skipUnsupported */)) { continue; } @@ -13176,8 +12396,9 @@ TEST_F(ExecutionTest, QuadAnyAll) { Skipped = false; // test compute - std::shared_ptr test = RunShaderOpTestAfterParse( - pDevice, m_support, "QuadAnyAll", CleanUAVBuffer0Buffer, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll", + CleanUAVBuffer0Buffer, ShaderOpSet); MappedData uavData; test->Test->GetReadBackData("UAVBuffer0", &uavData); @@ -13189,8 +12410,8 @@ TEST_F(ExecutionTest, QuadAnyAll) { pShaderOp->CS = nullptr; // test AS/MS - test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll", - CleanUAVBuffer0Buffer, ShaderOpSet); + test = st::RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll", + CleanUAVBuffer0Buffer, ShaderOpSet); test->Test->GetReadBackData("UAVBuffer0", &uavData); Result = VerifyQuadAnyAllResults((int2 *)uavData.data()); @@ -13337,7 +12558,7 @@ TEST_F(ExecutionTest, IsNormalTest) { WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); CComPtr pDevice; - VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0, + VERIFY_IS_TRUE(createDevice(&pDevice, 
D3D_SHADER_MODEL_6_0, false /* skipUnsupported */)); // The input is -Zero, Zero, -Denormal, Denormal, -Infinity, Infinity, -NaN, @@ -13354,7 +12575,7 @@ TEST_F(ExecutionTest, IsNormalTest) { std::vector *Validation_Expected = &Validation_Expected_Vec; CComPtr pStream; - ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream); + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream, m_support); std::shared_ptr ShaderOpSet = std::make_shared(); @@ -13395,9 +12616,10 @@ TEST_F(ExecutionTest, IsNormalTest) { // Test Compute shader { pShaderOp->CS = pShaderOp->GetString("CS60"); - std::shared_ptr test = - RunShaderOpTestAfterParse(pDevice, m_support, "IsNormal", - ResourceInitFn, ShaderInitFn, ShaderOpSet); + std::shared_ptr test = + st::RunShaderOpTestAfterParse(pDevice, m_support, "IsNormal", + ResourceInitFn, ShaderInitFn, + ShaderOpSet); MappedData data; test->Test->GetReadBackData("g_TestData", &data); diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h new file mode 100644 index 0000000000..3822ef02ad --- /dev/null +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h @@ -0,0 +1,405 @@ +#ifndef HLSLEXECTESTUTILS_H +#define HLSLEXECTESTUTILS_H + +#include "dxc/Support/dxcapi.use.h" +#include "dxc/Test/HlslTestUtils.h" +#include +#include +#include + +namespace ExecTestUtils { +// This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we +// only require the Windows 10 SDK. +typedef enum D3D_SHADER_MODEL { + D3D_SHADER_MODEL_5_1 = 0x51, + D3D_SHADER_MODEL_6_0 = 0x60, + D3D_SHADER_MODEL_6_1 = 0x61, + D3D_SHADER_MODEL_6_2 = 0x62, + D3D_SHADER_MODEL_6_3 = 0x63, + D3D_SHADER_MODEL_6_4 = 0x64, + D3D_SHADER_MODEL_6_5 = 0x65, + D3D_SHADER_MODEL_6_6 = 0x66, + D3D_SHADER_MODEL_6_7 = 0x67, + D3D_SHADER_MODEL_6_8 = 0x68, + D3D_SHADER_MODEL_6_9 = 0x69, + D3D_HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_9 +} D3D_SHADER_MODEL; +} // namespace ExecTestUtils + +static bool useDebugIfaces() { return true; } + +static bool useDxbc() { +#ifdef _HLK_CONF + return false; +#else + return hlsl_test::GetTestParamBool(L"DXBC"); +#endif +} + +static bool useWarpByDefualt() { +#ifdef _HLK_CONF + return false; +#else + return true; +#endif +} + +// A more recent Windows SDK than currently required is needed for these. +typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)( + UINT NumFeatures, __in_ecount(NumFeatures) const IID *IIDs, + __in_ecount_opt(NumFeatures) void *ConfigurationStructs, + __in_ecount_opt(NumFeatures) UINT *ConfigurationStructSizes); + +static const GUID D3D12ExperimentalShaderModelsID = + {/* 76f5573e-f13a-40f5-b297-81ce9e18933f */ + 0x76f5573e, + 0xf13a, + 0x40f5, + {0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f}}; + +// Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically. +typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID Rclsid, REFIID Riid, + void **Debug); + +#ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__ + +// Copied from AgilitySDK D3D12.h to programmatically enable when in developer +// mode. 
+#define __ID3D12SDKConfiguration_INTERFACE_DEFINED__ + +EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = { + 0xe9eb5314, + 0x33aa, + 0x42b2, + {0xa7, 0x18, 0xd7, 0x7f, 0x58, 0xb1, 0xf1, 0xc7}}; +EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = { + 0x7cda6aca, + 0xa03e, + 0x49c8, + {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}}; + +MIDL_INTERFACE("e9eb5314-33aa-42b2-a718-d77f58b1f1c7") +ID3D12SDKConfiguration : public IUnknown { +public: + virtual HRESULT STDMETHODCALLTYPE SetSDKVersion(UINT SDKVersion, + LPCSTR SDKPath) = 0; +}; +#endif /* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */ + +static std::wstring getModuleName() { + wchar_t ModuleName[MAX_PATH + 1] = {0}; + const DWORD Length = GetModuleFileNameW(NULL, ModuleName, MAX_PATH); + + if (Length == 0 || Length == MAX_PATH) + return std::wstring(); // Error condition + + return std::wstring(ModuleName, Length); +} + +static std::wstring computeSDKFullPath(std::wstring SDKPath) { + std::wstring ModulePath = getModuleName(); + const size_t Pos = ModulePath.rfind('\\'); + + if (Pos == std::wstring::npos) + return SDKPath; + + if (SDKPath.substr(0, 2) != L".\\") + return SDKPath; + + return ModulePath.substr(0, Pos) + SDKPath.substr(1); +} + +static UINT getD3D12SDKVersion(std::wstring SDKPath) { + // Try to automatically get the D3D12SDKVersion from the DLL + UINT SDKVersion = 0; + std::wstring D3DCorePath = computeSDKFullPath(SDKPath); + D3DCorePath.append(L"D3D12Core.dll"); + HMODULE D3DCore = LoadLibraryW(D3DCorePath.c_str()); + if (D3DCore) { + if (UINT *SDKVersionOut = + (UINT *)GetProcAddress(D3DCore, "D3D12SDKVersion")) + SDKVersion = *SDKVersionOut; + FreeModule(D3DCore); + } + return SDKVersion; +} + +static bool createDevice(ID3D12Device **D3DDevice, + ExecTestUtils::D3D_SHADER_MODEL TestModel = + ExecTestUtils::D3D_SHADER_MODEL_6_0, + bool SkipUnsupported = true) { + if (TestModel > ExecTestUtils::D3D_HIGHEST_SHADER_MODEL) { + const UINT Minor = (UINT)TestModel & 0x0f; + hlsl_test::LogCommentFmt(L"Installed SDK does not support " + L"shader model 6.%1u", + Minor); + + if (SkipUnsupported) + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); + + return false; + } + CComPtr DXGIFactory; + CComPtr D3DDeviceCom; + + *D3DDevice = nullptr; + + VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&DXGIFactory))); + if (hlsl_test::GetTestParamUseWARP(useWarpByDefualt())) { + CComPtr WarpAdapter; + VERIFY_SUCCEEDED(DXGIFactory->EnumWarpAdapter(IID_PPV_ARGS(&WarpAdapter))); + HRESULT CreateHR = D3D12CreateDevice(WarpAdapter, D3D_FEATURE_LEVEL_11_0, + IID_PPV_ARGS(&D3DDeviceCom)); + if (FAILED(CreateHR)) { + hlsl_test::LogCommentFmt( + L"The available version of WARP does not support d3d12."); + + if (SkipUnsupported) + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); + + return false; + } + + if (GetModuleHandleW(L"d3d10warp.dll") != NULL) { + WCHAR FullModuleFilePath[MAX_PATH] = L""; + GetModuleFileNameW(GetModuleHandleW(L"d3d10warp.dll"), FullModuleFilePath, + sizeof(FullModuleFilePath)); + WEX::Logging::Log::Comment(WEX::Common::String().Format( + L"WARP driver loaded from: %ls", FullModuleFilePath)); + } + + } else { + CComPtr HardwareAdapter; + WEX::Common::String AdapterValue; + HRESULT HR = WEX::TestExecution::RuntimeParameters::TryGetValue( + L"Adapter", AdapterValue); + if (SUCCEEDED(HR)) + st::GetHardwareAdapter(DXGIFactory, AdapterValue, &HardwareAdapter); + else + WEX::Logging::Log::Comment( + L"Using default hardware adapter with D3D12 support."); + + 
VERIFY_SUCCEEDED(D3D12CreateDevice(HardwareAdapter, D3D_FEATURE_LEVEL_11_0, + IID_PPV_ARGS(&D3DDeviceCom))); + } + // retrieve adapter information + const LUID AdapterID = D3DDeviceCom->GetAdapterLuid(); + CComPtr DXGIAdapter; + DXGIFactory->EnumAdapterByLuid(AdapterID, IID_PPV_ARGS(&DXGIAdapter)); + DXGI_ADAPTER_DESC AdapterDesc; + VERIFY_SUCCEEDED(DXGIAdapter->GetDesc(&AdapterDesc)); + hlsl_test::LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description); + + if (D3DDeviceCom == nullptr) + return false; + + if (!useDxbc()) { + // Check for DXIL support. + typedef struct D3D12_FEATURE_DATA_SHADER_MODEL { + ExecTestUtils::D3D_SHADER_MODEL HighestShaderModel; + } D3D12_FEATURE_DATA_SHADER_MODEL; + const UINT D3D12_FEATURE_SHADER_MODEL = 7; + D3D12_FEATURE_DATA_SHADER_MODEL SMData; + SMData.HighestShaderModel = TestModel; + if (FAILED(D3DDeviceCom->CheckFeatureSupport( + (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData, + sizeof(SMData))) || + SMData.HighestShaderModel < TestModel) { + const UINT Minor = (UINT)TestModel & 0x0f; + hlsl_test::LogCommentFmt(L"The selected device does not support " + L"shader model 6.%1u", + Minor); + + if (SkipUnsupported) + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); + + return false; + } + } + + if (useDebugIfaces()) { + CComPtr InfoQueue; + if (SUCCEEDED(D3DDeviceCom->QueryInterface(&InfoQueue))) + InfoQueue->SetMuteDebugOutput(FALSE); + } + + *D3DDevice = D3DDeviceCom.Detach(); + return true; +} + +inline void readHlslDataIntoNewStream(LPCWSTR RelativePath, IStream **Stream, + dxc::DxcDllSupport &Support) { + VERIFY_SUCCEEDED(Support.Initialize()); + CComPtr Library; + CComPtr Blob; + CComPtr StreamCom; + std::wstring Path = hlsl_test::GetPathToHlslDataFile( + RelativePath, HLSLDATAFILEPARAM, DEFAULT_EXEC_TEST_DIR); + VERIFY_SUCCEEDED(Support.CreateInstance(CLSID_DxcLibrary, &Library)); + VERIFY_SUCCEEDED(Library->CreateBlobFromFile(Path.c_str(), nullptr, &Blob)); + VERIFY_SUCCEEDED(Library->CreateStreamFromBlobReadOnly(Blob, &StreamCom)); + *Stream = StreamCom.Detach(); +} + +static HRESULT enableAgilitySDK(HMODULE Runtime, UINT SDKVersion, + LPCWSTR SDKPath) { + D3D12GetInterfaceFn GetInterfaceFunc = + (D3D12GetInterfaceFn)GetProcAddress(Runtime, "D3D12GetInterface"); + CComPtr D3D12SDKConfiguration; + IFR(GetInterfaceFunc(CLSID_D3D12SDKConfiguration, + IID_PPV_ARGS(&D3D12SDKConfiguration))); + IFR(D3D12SDKConfiguration->SetSDKVersion(SDKVersion, CW2A(SDKPath))); + + // Currently, it appears that the SetSDKVersion will succeed even when + // D3D12Core is not found, or its version doesn't match. When that's the + // case, will cause a failure in the very next thing that actually requires + // D3D12Core.dll to be loaded instead. So, we attempt to clear experimental + // features next, which is a valid use case and a no-op at this point. This + // requires D3D12Core to be loaded. If this fails, we know the AgilitySDK + // setting actually failed. + D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc = + (D3D12EnableExperimentalFeaturesFn)GetProcAddress( + Runtime, "D3D12EnableExperimentalFeatures"); + if (ExperimentalFeaturesFunc == nullptr) + // If this failed, D3D12 must be too old for AgilitySDK. But if that's + // the case, creating D3D12SDKConfiguration should have failed. So while + // this case shouldn't be hit, fail if it is. 
+ return HRESULT_FROM_WIN32(GetLastError()); + + return ExperimentalFeaturesFunc(0, nullptr, nullptr, nullptr); +} + +static HRESULT +enableExperimentalShaderModels(HMODULE hRuntime, + UUID AdditionalFeatures[] = nullptr, + size_t NumAdditionalFeatures = 0) { + D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc = + (D3D12EnableExperimentalFeaturesFn)GetProcAddress( + hRuntime, "D3D12EnableExperimentalFeatures"); + if (ExperimentalFeaturesFunc == nullptr) + return HRESULT_FROM_WIN32(GetLastError()); + + std::vector Features; + + Features.push_back(D3D12ExperimentalShaderModels); + + if (AdditionalFeatures != nullptr && NumAdditionalFeatures > 0) + Features.insert(Features.end(), AdditionalFeatures, + AdditionalFeatures + NumAdditionalFeatures); + + return ExperimentalFeaturesFunc((UINT)Features.size(), Features.data(), + nullptr, nullptr); +} + +static HRESULT +enableExperimentalShaderModels(UUID AdditionalFeatures[] = nullptr, + size_t NumAdditionalFeatures = 0) { + HMODULE Runtime = LoadLibraryW(L"d3d12.dll"); + if (Runtime == NULL) + return E_FAIL; + return enableExperimentalShaderModels(Runtime, AdditionalFeatures, + NumAdditionalFeatures); +} + +static HRESULT disableExperimentalShaderModels() { + HMODULE Runtime = LoadLibraryW(L"d3d12.dll"); + if (Runtime == NULL) + return E_FAIL; + + D3D12EnableExperimentalFeaturesFn ExperimentalFeaturesFunc = + (D3D12EnableExperimentalFeaturesFn)GetProcAddress( + Runtime, "D3D12EnableExperimentalFeatures"); + if (ExperimentalFeaturesFunc == nullptr) + return HRESULT_FROM_WIN32(GetLastError()); + + return ExperimentalFeaturesFunc(0, nullptr, nullptr, nullptr); +} + +static HRESULT enableAgilitySDK(HMODULE Runtime) { + // D3D12SDKVersion > 1 will use provided version, otherwise, auto-detect. + // D3D12SDKVersion == 1 means fail if we can't auto-detect. + UINT SDKVersion = 0; + WEX::TestExecution::RuntimeParameters::TryGetValue(L"D3D12SDKVersion", + SDKVersion); + + // SDKPath must be relative path from .exe, which means relative to + // TE.exe location, and must start with ".\\", such as with the + // default: ".\\D3D12\\" + WEX::Common::String SDKPath; + if (SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue( + L"D3D12SDKPath", SDKPath))) { + // Make sure path ends in backslash + if (!SDKPath.IsEmpty() && SDKPath.Right(1) != "\\") + SDKPath.Append("\\"); + } + + if (SDKPath.IsEmpty()) + SDKPath = L".\\D3D12\\"; + + const bool MustFind = SDKVersion > 0; + if (SDKVersion <= 1) { + // lookup version from D3D12Core.dll + SDKVersion = getD3D12SDKVersion((LPCWSTR)SDKPath); + if (MustFind && SDKVersion == 0) { + hlsl_test::LogErrorFmt(L"Agility SDK not found in relative path: %s", + (LPCWSTR)SDKPath); + return E_FAIL; + } + } + + // Not found, not asked for. + if (SDKVersion == 0) + return S_FALSE; + + HRESULT HR = enableAgilitySDK(Runtime, SDKVersion, (LPCWSTR)SDKPath); + if (FAILED(HR)) { + // If SDKVersion provided, fail if not successful. + // 1 means we should find it, and fill in the version automatically. 
+ if (MustFind) { + hlsl_test::LogErrorFmt( + L"Failed to set Agility SDK version %d at path: %s", SDKVersion, + (LPCWSTR)SDKPath); + return HR; + } + return S_FALSE; + } + if (HR == S_OK) + hlsl_test::LogCommentFmt(L"Agility SDK version set to: %d", SDKVersion); + + return HR; +} + +static HRESULT enableExperimentalMode(HMODULE Runtime) { +#ifdef _FORCE_EXPERIMENTAL_SHADERS + bool ExperimentalShaderModels = true; +#else + bool ExperimentalShaderModels = + hlsl_test::GetTestParamBool(L"ExperimentalShaders"); +#endif // _FORCE_EXPERIMENTAL_SHADERS + + HRESULT HR = S_FALSE; + if (ExperimentalShaderModels) { + HR = enableExperimentalShaderModels(Runtime); + if (SUCCEEDED(HR)) + WEX::Logging::Log::Comment(L"Experimental shader models enabled."); + } + + return HR; +} + +static HRESULT enableDebugLayer() { + // The debug layer does net yet validate DXIL programs that require + // rewriting, but basic logging should work properly. + HRESULT HR = S_FALSE; + if (useDebugIfaces()) { + CComPtr DebugController; + HR = D3D12GetDebugInterface(IID_PPV_ARGS(&DebugController)); + if (SUCCEEDED(HR)) { + DebugController->EnableDebugLayer(); + HR = S_OK; + } + } + return HR; +} + +#endif // HLSLEXECTESTUTILS_H diff --git a/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml new file mode 100644 index 0000000000..f3b2e62dbc --- /dev/null +++ b/tools/clang/unittests/HLSLExec/LongVectorOpTable.xml @@ -0,0 +1,693 @@ + + + + + + String + + String + String + String + + + + BinaryOpType_ScalarAdd + bool + + + BinaryOpType_Add + bool + + + BinaryOpType_ScalarSubtract + bool + + + BinaryOpType_Subtract + bool + + + + BinaryOpType_ScalarAdd + int16 + + + BinaryOpType_Add + int16 + + + BinaryOpType_ScalarSubtract + int16 + + + BinaryOpType_Subtract + int16 + + + BinaryOpType_ScalarMultiply + int16 + + + BinaryOpType_Multiply + int16 + + + BinaryOpType_ScalarDivide + int16 + + + BinaryOpType_Divide + int16 + + + BinaryOpType_ScalarModulus + int16 + + + BinaryOpType_Modulus + int16 + + + BinaryOpType_ScalarMin + int16 + + + BinaryOpType_Min + int16 + + + BinaryOpType_ScalarMax + int16 + + + BinaryOpType_Max + int16 + + + + BinaryOpType_ScalarAdd + int32 + + + BinaryOpType_Add + int32 + + + BinaryOpType_ScalarSubtract + int32 + + + BinaryOpType_Subtract + int32 + + + BinaryOpType_ScalarMultiply + int32 + + + BinaryOpType_Multiply + int32 + + + BinaryOpType_ScalarDivide + int32 + + + BinaryOpType_Divide + int32 + + + BinaryOpType_ScalarModulus + int32 + + + BinaryOpType_Modulus + int32 + + + BinaryOpType_ScalarMin + int32 + + + BinaryOpType_Min + int32 + + + BinaryOpType_ScalarMax + int32 + + + BinaryOpType_Max + int32 + + + + BinaryOpType_ScalarAdd + int64 + + + BinaryOpType_Add + int64 + + + BinaryOpType_ScalarSubtract + int64 + + + BinaryOpType_Subtract + int64 + + + BinaryOpType_ScalarMultiply + int64 + + + BinaryOpType_Multiply + int64 + + + BinaryOpType_ScalarDivide + int64 + + + BinaryOpType_Divide + int64 + + + BinaryOpType_ScalarModulus + int64 + + + BinaryOpType_Modulus + int64 + + + BinaryOpType_ScalarMin + int64 + + + BinaryOpType_Min + int64 + + + BinaryOpType_ScalarMax + int64 + + + BinaryOpType_Max + int64 + + + + BinaryOpType_ScalarAdd + uint16 + + + BinaryOpType_Add + uint16 + + + BinaryOpType_ScalarSubtract + uint16 + + + BinaryOpType_Subtract + uint16 + + + BinaryOpType_ScalarMultiply + uint16 + + + BinaryOpType_Multiply + uint16 + + + BinaryOpType_ScalarDivide + uint16 + + + BinaryOpType_Divide + uint16 + + + BinaryOpType_ScalarModulus + uint16 + + 
+ BinaryOpType_Modulus + uint16 + + + BinaryOpType_ScalarMin + uint16 + + + BinaryOpType_Min + uint16 + + + BinaryOpType_ScalarMax + uint16 + + + BinaryOpType_Max + uint16 + + + + BinaryOpType_ScalarAdd + uint32 + + + BinaryOpType_Add + uint32 + + + BinaryOpType_ScalarSubtract + uint32 + + + BinaryOpType_Subtract + uint32 + + + BinaryOpType_ScalarMultiply + uint32 + + + BinaryOpType_Multiply + uint32 + + + BinaryOpType_ScalarDivide + uint32 + + + BinaryOpType_Divide + uint32 + + + BinaryOpType_ScalarModulus + uint32 + + + BinaryOpType_Modulus + uint32 + + + BinaryOpType_ScalarMin + uint32 + + + BinaryOpType_Min + uint32 + + + BinaryOpType_ScalarMax + uint32 + + + BinaryOpType_Max + uint32 + + + + BinaryOpType_ScalarAdd + uint64 + + + BinaryOpType_Add + uint64 + + + BinaryOpType_ScalarSubtract + uint64 + + + BinaryOpType_Subtract + uint64 + + + BinaryOpType_ScalarMultiply + uint64 + + + BinaryOpType_Multiply + uint64 + + + BinaryOpType_ScalarDivide + uint64 + + + BinaryOpType_Divide + uint64 + + + BinaryOpType_ScalarModulus + uint64 + + + BinaryOpType_Modulus + uint64 + + + BinaryOpType_ScalarMin + uint64 + + + BinaryOpType_Min + uint64 + + + BinaryOpType_ScalarMax + uint64 + + + BinaryOpType_Max + uint64 + + + + BinaryOpType_ScalarAdd + float16 + + + BinaryOpType_Add + float16 + + + BinaryOpType_ScalarSubtract + float16 + + + BinaryOpType_Subtract + float16 + + + BinaryOpType_ScalarMultiply + float16 + + + BinaryOpType_Multiply + float16 + + + BinaryOpType_ScalarDivide + float16 + + + BinaryOpType_Divide + float16 + + + BinaryOpType_ScalarModulus + float16 + + + BinaryOpType_Modulus + float16 + + + BinaryOpType_ScalarMin + float16 + + + BinaryOpType_Min + float16 + + + BinaryOpType_ScalarMax + float16 + + + BinaryOpType_Max + float16 + + + + BinaryOpType_ScalarAdd + float32 + + + BinaryOpType_Add + float32 + + + BinaryOpType_ScalarSubtract + float32 + + + BinaryOpType_Subtract + float32 + + + BinaryOpType_ScalarMultiply + float32 + + + BinaryOpType_Multiply + float32 + + + BinaryOpType_ScalarDivide + float32 + + + BinaryOpType_Divide + float32 + + + BinaryOpType_ScalarModulus + float32 + + + BinaryOpType_Modulus + float32 + + + BinaryOpType_ScalarMin + float32 + + + BinaryOpType_Min + float32 + + + BinaryOpType_ScalarMax + float32 + + + BinaryOpType_Max + float32 + + + + BinaryOpType_ScalarAdd + float64 + + + BinaryOpType_Add + float64 + + + BinaryOpType_ScalarSubtract + float64 + + + BinaryOpType_Subtract + float64 + + + BinaryOpType_ScalarMultiply + float64 + + + BinaryOpType_Multiply + float64 + + + BinaryOpType_ScalarDivide + float64 + + + BinaryOpType_Divide + float64 + + + BinaryOpType_ScalarMin + float64 + + + BinaryOpType_Min + float64 + + + BinaryOpType_ScalarMax + float64 + + + BinaryOpType_Max + float64 + +
+ + + + String + String + String + + + + UnaryOpType_Initialize + bool + + + + UnaryOpType_Initialize + int16 + + + + UnaryOpType_Initialize + int32 + + + + UnaryOpType_Initialize + int64 + + + + UnaryOpType_Initialize + uint16 + + + + UnaryOpType_Initialize + uint32 + + + + UnaryOpType_Initialize + uint64 + + + + UnaryOpType_Initialize + float16 + + + + UnaryOpType_Initialize + float32 + + + + UnaryOpType_Initialize + float64 + +
+ + + + String + + String + String + + + + TrigonometricOpType_Acos + float16 + TrigonometricInputValueSet_RangeOne + + + TrigonometricOpType_Asin + float16 + TrigonometricInputValueSet_RangeHalfPi + + + TrigonometricOpType_Atan + float16 + TrigonometricInputValueSet_RangeHalfPi + + + TrigonometricOpType_Cos + float16 + + + TrigonometricOpType_Cosh + float16 + + + TrigonometricOpType_Sin + float16 + + + TrigonometricOpType_Sinh + float16 + + + TrigonometricOpType_Tan + float16 + + + TrigonometricOpType_Tanh + float16 + + + + TrigonometricOpType_Acos + float32 + TrigonometricInputValueSet_RangeOne + + + TrigonometricOpType_Asin + float32 + TrigonometricInputValueSet_RangeHalfPi + + + TrigonometricOpType_Atan + float32 + TrigonometricInputValueSet_RangeHalfPi + + + TrigonometricOpType_Cos + float32 + + + TrigonometricOpType_Cosh + float32 + + + TrigonometricOpType_Sin + float32 + + + TrigonometricOpType_Sinh + float32 + + + TrigonometricOpType_Tan + float32 + + + TrigonometricOpType_Tanh + float32 + +
+
diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h new file mode 100644 index 0000000000..bc6ea8c7c2 --- /dev/null +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -0,0 +1,298 @@ +#ifndef LONGVECTORTESTDATA_H +#define LONGVECTORTESTDATA_H + +#include +#include +#include +#include +#include + +// A helper struct because C++ bools are 1 byte and HLSL bools are 4 bytes. +// Take int32_t as a constuctor argument and convert it to bool when needed. +// Comparisons cast to a bool because we only care if the bool representation is +// true or false. +struct HLSLBool_t { + HLSLBool_t() : Val(0) {} + HLSLBool_t(int32_t Val) : Val(Val) {} + HLSLBool_t(bool Val) : Val(Val) {} + HLSLBool_t(const HLSLBool_t &Other) : Val(Other.Val) {} + + bool operator==(const HLSLBool_t &Other) const { + return static_cast(Val) == static_cast(Other.Val); + } + + bool operator!=(const HLSLBool_t &Other) const { + return static_cast(Val) != static_cast(Other.Val); + } + + bool operator<(const HLSLBool_t &Other) const { return Val < Other.Val; } + + bool operator>(const HLSLBool_t &Other) const { return Val > Other.Val; } + + bool operator<=(const HLSLBool_t &Other) const { return Val <= Other.Val; } + + bool operator>=(const HLSLBool_t &Other) const { return Val >= Other.Val; } + + HLSLBool_t operator*(const HLSLBool_t &Other) const { + return HLSLBool_t(Val * Other.Val); + } + + HLSLBool_t operator+(const HLSLBool_t &Other) const { + return HLSLBool_t(Val + Other.Val); + } + + HLSLBool_t operator-(const HLSLBool_t &Other) const { + return HLSLBool_t(Val - Other.Val); + } + + HLSLBool_t operator/(const HLSLBool_t &Other) const { + return HLSLBool_t(Val / Other.Val); + } + + HLSLBool_t operator%(const HLSLBool_t &Other) const { + return HLSLBool_t(Val % Other.Val); + } + + // So we can construct std::wstrings using std::wostream + friend std::wostream &operator<<(std::wostream &Os, const HLSLBool_t &Obj) { + Os << static_cast(Obj.Val); + return Os; + } + + // So we can construct std::strings using std::ostream + friend std::ostream &operator<<(std::ostream &Os, const HLSLBool_t &Obj) { + Os << static_cast(Obj.Val); + return Os; + } + + int32_t Val = 0; +}; + +// No native float16 type in C++ until C++23 . So we use uint16_t to represent +// it. Simple little wrapping struct to help handle the right behavior. +struct HLSLHalf_t { + HLSLHalf_t() : Val(0) {} + HLSLHalf_t(DirectX::PackedVector::HALF Val) : Val(Val) {} + HLSLHalf_t(const HLSLHalf_t &Other) : Val(Other.Val) {} + HLSLHalf_t(const float F) { + Val = DirectX::PackedVector::XMConvertFloatToHalf(F); + } + HLSLHalf_t(const double D) { + float F = 0.0f; + // We wrap '::max' in () to prevent it from being expanded as a + // macro by the Windows SDK. 
+ if (D >= (std::numeric_limits::max)()) + F = (std::numeric_limits::max)(); + else if (D <= std::numeric_limits::lowest()) + F = std::numeric_limits::lowest(); + else + F = static_cast(D); + + Val = DirectX::PackedVector::XMConvertFloatToHalf(F); + } + HLSLHalf_t(const int I) { + VERIFY_IS_TRUE(I == 0, L"HLSLHalf_t constructor with int override only " + L"meant for cases when initializing to 0."); + const float F = static_cast(I); + Val = DirectX::PackedVector::XMConvertFloatToHalf(F); + } + + // Implicit conversion to float for use with things like std::acos, std::tan, + // etc + operator float() const { + return DirectX::PackedVector::XMConvertHalfToFloat(Val); + } + + bool operator==(const HLSLHalf_t &Other) const { + // Convert to floats to properly handle the '0 == -0' case which must + // compare to true but have different uint16_t values. + // That is, 0 == -0 is true. We store Val as a uint16_t. + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + return A == B; + } + + bool operator<(const HLSLHalf_t &Other) const { + return DirectX::PackedVector::XMConvertHalfToFloat(Val) < + DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + } + + bool operator>(const HLSLHalf_t &Other) const { + return DirectX::PackedVector::XMConvertHalfToFloat(Val) > + DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + } + + // Used by tolerance checks in the tests. + bool operator>(float F) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + return A > F; + } + + bool operator<(float F) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + return A < F; + } + + bool operator<=(const HLSLHalf_t &Other) const { + return DirectX::PackedVector::XMConvertHalfToFloat(Val) <= + DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + } + + bool operator>=(const HLSLHalf_t &Other) const { + return DirectX::PackedVector::XMConvertHalfToFloat(Val) >= + DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + } + + bool operator!=(const HLSLHalf_t &Other) const { return Val != Other.Val; } + + HLSLHalf_t operator*(const HLSLHalf_t &Other) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A * B)); + } + + HLSLHalf_t operator+(const HLSLHalf_t &Other) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A + B)); + } + + HLSLHalf_t operator-(const HLSLHalf_t &Other) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A - B)); + } + + HLSLHalf_t operator/(const HLSLHalf_t &Other) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + return HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(A / B)); + } + + HLSLHalf_t operator%(const HLSLHalf_t &Other) const { + const float A = DirectX::PackedVector::XMConvertHalfToFloat(Val); + const float B = DirectX::PackedVector::XMConvertHalfToFloat(Other.Val); + const float C = std::fmod(A, B); + return 
HLSLHalf_t(DirectX::PackedVector::XMConvertFloatToHalf(C)); + } + + // So we can construct std::wstrings using std::wostream + friend std::wostream &operator<<(std::wostream &Os, const HLSLHalf_t &Obj) { + Os << DirectX::PackedVector::XMConvertHalfToFloat(Obj.Val); + return Os; + } + + // So we can construct std::wstrings using std::wostream + friend std::ostream &operator<<(std::ostream &Os, const HLSLHalf_t &Obj) { + Os << DirectX::PackedVector::XMConvertHalfToFloat(Obj.Val); + return Os; + } + + // HALF is an alias to uint16_t + DirectX::PackedVector::HALF Val = 0; +}; + +template struct LongVectorTestData { + static const std::map> Data; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", + {false, true, false, false, false, false, true, true, true, true}}, + {L"DefaultInputValueSet2", + {true, false, false, false, false, true, true, true, false, false}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {-6, 1, 7, 3, 8, 4, -3, 8, 8, -2}}, + {L"DefaultInputValueSet2", {5, -6, -3, -2, 9, 3, 1, -3, -7, 2}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {-6, 1, 7, 3, 8, 4, -3, 8, 8, -2}}, + {L"DefaultInputValueSet2", {5, -6, -3, -2, 9, 3, 1, -3, -7, 2}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {-6, 11, 7, 3, 8, 4, -3, 8, 8, -2}}, + {L"DefaultInputValueSet2", {5, -1337, -3, -2, 9, 3, 1, -3, 501, 2}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {1, 699, 3, 1023, 5, 6, 0, 8, 9, 10}}, + {L"DefaultInputValueSet2", {2, 111, 3, 4, 5, 9, 21, 8, 9, 10}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {1, 2, 3, 4, 5, 0, 7, 8, 9, 10}}, + {L"DefaultInputValueSet2", {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", {1, 2, 3, 4, 5, 0, 7, 1000, 9, 10}}, + {L"DefaultInputValueSet2", {1, 2, 1337, 4, 5, 6, 7, 8, 9, 10}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", + {-1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01}}, + {L"DefaultInputValueSet2", + {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}}, + {L"DefaultClampArgs", {-1.0, 1.0}}, // Min, Max values for clamp + // Range [ -pi/2, pi/2] + {L"TrigonometricInputValueSet_RangeHalfPi", + {-1.073, 0.044, -1.047, 0.313, 1.447, -0.865, 1.364, -0.715, -0.800, + 0.541}}, + {L"TrigonometricInputValueSet_RangeOne", + {0.331, 0.727, -0.957, 0.677, -0.025, 0.495, 0.855, -0.673, -0.678, + -0.905}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const std::map> Data = { + {L"DefaultInputValueSet1", + {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}}, + {L"DefaultInputValueSet2", + {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}}, + // Range [ -pi/2, pi/2] + {L"TrigonometricInputValueSet_RangeHalfPi", + {0.315f, -0.316f, 1.409f, -0.09f, -1.569f, 1.302f, -0.326f, 0.781f, + -1.235f, 0.623f}}, + {L"TrigonometricInputValueSet_RangeOne", + {0.727f, 0.331f, -0.957f, 0.677f, -0.025f, 0.495f, 0.855f, -0.673f, + -0.678f, -0.905f}}, + }; +}; + +template <> struct LongVectorTestData { + inline static const 
std::map> Data = { + {L"DefaultInputValueSet1", + {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}}, + {L"DefaultInputValueSet2", + {1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0}}, + // Range [ -pi/2, pi/2] + {L"TrigonometricInputValueSet_RangeHalfPi", + {0.807, 0.605, 1.317, 0.188, 1.566, -1.507, 0.67, -1.553, 0.194, + -0.883}}, + {L"TrigonometricInputValueSet_RangeOne", + {0.331, 0.277, -0.957, 0.677, -0.025, 0.495, 0.855, -0.673, -0.678, + -0.905}}}; +}; + +#endif // LONGVECTORTESTDATA_H diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp new file mode 100644 index 0000000000..b9e79cfc5e --- /dev/null +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -0,0 +1,341 @@ +#include "LongVectors.h" +#include "HlslExecTestUtils.h" +#include + +LongVector::BinaryOpType +LongVector::getBinaryOpType(const std::wstring &OpTypeString) { + return getLongVectorOpType( + binaryOpTypeStringToEnumMap, OpTypeString, + std::size(binaryOpTypeStringToEnumMap)); +} + +LongVector::UnaryOpType +LongVector::getUnaryOpType(const std::wstring &OpTypeString) { + return getLongVectorOpType( + unaryOpTypeStringToEnumMap, OpTypeString, + std::size(unaryOpTypeStringToEnumMap)); +} + +LongVector::TrigonometricOpType +LongVector::getTrigonometricOpType(const std::wstring &OpTypeString) { + return getLongVectorOpType( + trigonometricOpTypeStringToEnumMap, OpTypeString, + std::size(trigonometricOpTypeStringToEnumMap)); +} + +// These are helper arrays to be used with the TableParameterHandler that parses +// the LongVectorOpTable.xml file for us. +static TableParameter BinaryOpParameters[] = { + {L"DataType", TableParameter::STRING, true}, + {L"OpTypeEnum", TableParameter::STRING, true}, + {L"InputValueSetName1", TableParameter::STRING, false}, + {L"InputValueSetName2", TableParameter::STRING, false}, +}; + +static TableParameter UnaryOpParameters[] = { + {L"DataType", TableParameter::STRING, true}, + {L"OpTypeEnum", TableParameter::STRING, true}, + {L"InputValueSetName1", TableParameter::STRING, false}, +}; + +bool LongVector::OpTest::classSetup() { + // Run this only once. + if (!Initialized) { + Initialized = true; + + HMODULE Runtime = LoadLibraryW(L"d3d12.dll"); + if (Runtime == NULL) + return false; + // Do not: FreeLibrary(hRuntime); + // If we actually free the library, it defeats the purpose of + // enableAgilitySDK and enableExperimentalMode. 
+ + HRESULT HR; + HR = enableAgilitySDK(Runtime); + + if (FAILED(HR)) + hlsl_test::LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", HR); + else if (HR == S_FALSE) + hlsl_test::LogCommentFmt(L"Agility SDK not enabled."); + else + hlsl_test::LogCommentFmt(L"Agility SDK enabled."); + + HR = enableExperimentalMode(Runtime); + if (FAILED(HR)) + hlsl_test::LogCommentFmt( + L"Unable to enable shader experimental mode - 0x%08x.", HR); + else if (HR == S_FALSE) + hlsl_test::LogCommentFmt(L"Experimental mode not enabled."); + else + hlsl_test::LogCommentFmt(L"Experimental mode enabled."); + + HR = enableDebugLayer(); + if (FAILED(HR)) + hlsl_test::LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", HR); + else if (HR == S_FALSE) + hlsl_test::LogCommentFmt(L"Debug layer not enabled."); + else + hlsl_test::LogCommentFmt(L"Debug layer enabled."); + } + + return true; +} + +TEST_F(LongVector::OpTest, binaryOpTest) { + WEX::TestExecution::SetVerifyOutput verifySettings( + WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); + + using namespace WEX::Common; + + const int TableSize = sizeof(BinaryOpParameters) / sizeof(TableParameter); + TableParameterHandler Handler(BinaryOpParameters, TableSize); + + std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str); + std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str); + + auto OpType = LongVector::getBinaryOpType(OpTypeString); + dispatchTestByDataType(OpType, DataType, Handler); +} + +TEST_F(LongVector::OpTest, trigonometricOpTest) { + WEX::TestExecution::SetVerifyOutput verifySettings( + WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); + + const int TableSize = sizeof(UnaryOpParameters) / sizeof(TableParameter); + TableParameterHandler Handler(UnaryOpParameters, TableSize); + + std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str); + std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str); + + auto OpType = LongVector::getTrigonometricOpType(OpTypeString); + dispatchTestByDataType(OpType, DataType, Handler); +} + +TEST_F(LongVector::OpTest, unaryOpTest) { + WEX::TestExecution::SetVerifyOutput verifySettings( + WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); + + const int TableSize = sizeof(UnaryOpParameters) / sizeof(TableParameter); + TableParameterHandler Handler(UnaryOpParameters, TableSize); + + std::wstring DataType(Handler.GetTableParamByName(L"DataType")->m_str); + std::wstring OpTypeString(Handler.GetTableParamByName(L"OpTypeEnum")->m_str); + + auto OpType = LongVector::getUnaryOpType(OpTypeString); + dispatchTestByDataType(OpType, DataType, Handler); +} + +template +void LongVector::OpTest::dispatchTestByDataType( + LongVectorOpTypeT OpType, std::wstring DataType, + TableParameterHandler &Handler) { + using namespace WEX::Common; + + if (DataType == L"bool") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"int16") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"int32") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"int64") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"uint16") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"uint32") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"uint64") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"float16") + dispatchTestByVectorSize(OpType, Handler); + else if (DataType == L"float32") + dispatchTestByVectorSize(OpType, Handler); + else if 
(DataType == L"float64") + dispatchTestByVectorSize(OpType, Handler); + else + VERIFY_FAIL( + String().Format(L"DataType: %s is not recognized.", DataType.c_str())); +} + +template +void LongVector::OpTest::dispatchTestByVectorSize( + LongVectorOpTypeT opType, TableParameterHandler &Handler) { + WEX::TestExecution::SetVerifyOutput verifySettings( + WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); + + LongVector::TestConfig TestConfig(opType); + + // InputValueSetName1 is optional. So the string may be empty. An empty + // string will result in the default value set for this DataType being used. + std::wstring InputValueSet1( + Handler.GetTableParamByName(L"InputValueSetName1")->m_str); + if (!InputValueSet1.empty()) + TestConfig.setInputValueSet1(InputValueSet1); + + // InputValueSetName2 is optional. So the string may be empty. An empty + // string will result in the default value set for this DataType being used. + if (TestConfig.isBinaryOp()) { + std::wstring InputValueSet2( + Handler.GetTableParamByName(L"InputValueSetName2")->m_str); + if (!InputValueSet2.empty()) + TestConfig.setInputValueSet2(InputValueSet2); + } + + std::vector InputVectorSizes = {3, 4, 5, 16, 17, 35, 100, 256, 1024}; + for (auto SizeToTest : InputVectorSizes) { + testBaseMethod(TestConfig, SizeToTest); + } +} + +template +void LongVector::OpTest::testBaseMethod( + LongVector::TestConfig &TestConfig, + size_t VectorSizeToTest) { + WEX::TestExecution::SetVerifyOutput verifySettings( + WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures); + + hlsl_test::LogCommentFmt(L"Running LongVectorOpTestBase<%S, %zu>", + typeid(DataTypeT).name(), VectorSizeToTest); + + bool LogInputs = false; + WEX::TestExecution::RuntimeParameters::TryGetValue(L"LongVectorLogInputs", + LogInputs); + + CComPtr D3DDevice; + if (!createDevice(&D3DDevice, ExecTestUtils::D3D_SHADER_MODEL_6_9, false)) { +#ifdef _HLK_CONF + LOG_ERROR_FMT_THROW( + L"Device does not support SM 6.9. Can't run these tests."); +#else + WEX::Logging::Log::Comment( + "Device does not support SM 6.9. Can't run these tests."); + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); + return; +#endif + } + + std::vector InputVector1; + InputVector1.reserve(VectorSizeToTest); + std::vector InputVector2; // May be unused, but must be defined. + InputVector2.reserve(VectorSizeToTest); + std::vector ScalarInput; // May be unused, but must be defined. + const bool IsVectorBinaryOp = + TestConfig.isBinaryOp() && !TestConfig.isScalarOp(); + + std::vector InputVector1ValueSet = TestConfig.getInputValueSet1(); + std::vector InputVector2ValueSet = + TestConfig.isBinaryOp() ? TestConfig.getInputValueSet2() + : std::vector(); + + if (TestConfig.isScalarOp()) + // Scalar ops are always binary ops. So InputVector2ValueSet is initialized + // with values above. + ScalarInput.push_back(InputVector2ValueSet[0]); + + // Fill the input vectors with values from the value set. Repeat the values + // when we reach the end of the value set. 
+ for (size_t Index = 0; Index < VectorSizeToTest; Index++) { + InputVector1.push_back( + InputVector1ValueSet[Index % InputVector1ValueSet.size()]); + + if (IsVectorBinaryOp) + InputVector2.push_back( + InputVector2ValueSet[Index % InputVector2ValueSet.size()]); + } + + std::vector ExpectedVector; + ExpectedVector.reserve(VectorSizeToTest); + if (IsVectorBinaryOp) + ExpectedVector = + computeExpectedValues(InputVector1, InputVector2, TestConfig); + else if (TestConfig.isScalarOp()) + ExpectedVector = + computeExpectedValues(InputVector1, ScalarInput[0], TestConfig); + else // Must be a unary op + ExpectedVector = computeExpectedValues(InputVector1, TestConfig); + + if (LogInputs) { + logLongVector(InputVector1, L"InputVector1"); + + if (IsVectorBinaryOp) + logLongVector(InputVector2, L"InputVector2"); + else if (TestConfig.isScalarOp()) + logLongVector(ScalarInput, L"ScalarInput"); + } + + // We have to construct the string outside of the lambda. Otherwise it's + // cleaned up when the lambda finishes executing but before the shader runs. + std::string CompilerOptionsString = + TestConfig.getCompilerOptionsString(VectorSizeToTest); + + // The name of the shader we want to use in ShaderOpArith.xml. Could also add + // logic to set this name in ShaderOpArithTable.xml so we can use different + // shaders for different tests. + LPCSTR ShaderName = "LongVectorOp"; + // ShaderOpArith.xml defines the input/output resources and the shader source. + CComPtr TestXML; + readHlslDataIntoNewStream(L"ShaderOpArith.xml", &TestXML, DxcDllSupport); + + // RunShaderOpTest is a helper function that handles resource creation + // and setup. It also handles the shader compilation and execution. It takes a + // callback that is called when the shader is compiled, but before it is + // executed. + std::shared_ptr TestResult = st::RunShaderOpTest( + D3DDevice, DxcDllSupport, TestXML, ShaderName, + [&](LPCSTR Name, std::vector &ShaderData, st::ShaderOp *ShaderOp) { + hlsl_test::LogCommentFmt(L"RunShaderOpTest CallBack. Resource Name: %S", + Name); + + // This callback is called once for each resource defined for + // "LongVectorOp" in ShaderOpArith.xml. All callbacks are fired for each + // resource. We determine whether they are applicable to the test case + // when they run. + + // Process the callback for the OutputVector resource. + if (0 == _stricmp(Name, "OutputVector")) { + // We only need to set the compiler options string once. So this is a + // convenient place to do it. + ShaderOp->Shaders.at(0).Arguments = CompilerOptionsString.c_str(); + + return; + } + + // Process the callback for the InputFuncArgs resource. + if (0 == _stricmp(Name, "InputFuncArgs")) { + if (TestConfig.isScalarOp()) + fillShaderBufferFromLongVectorData(ShaderData, + ScalarInput); + return; + } + + // Process the callback for the InputVector1 resource. + if (0 == _stricmp(Name, "InputVector1")) { + fillShaderBufferFromLongVectorData(ShaderData, + InputVector1); + return; + } + + // Process the callback for the InputVector2 resource. + if (0 == _stricmp(Name, "InputVector2")) { + if (IsVectorBinaryOp) + fillShaderBufferFromLongVectorData(ShaderData, + InputVector2); + + return; + } + + LOG_ERROR_FMT_THROW( + L"RunShaderOpTest CallBack. Unexpected Resource Name: %S", Name); + }); + + // Map the data from GPU to CPU memory so we can verify our expectations. 
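// The "OutputVector" read-back below returns the contents of the output UAV
// declared for LongVectorOp in ShaderOpArith.xml; doVectorsMatch then
// compares it element-wise against ExpectedVector using the tolerance and
// validation mode (epsilon or ULP) chosen by the TestConfig constructor for
// this op and data type.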
+ MappedData ShaderOutData; + TestResult->Test->GetReadBackData("OutputVector", &ShaderOutData); + + std::vector OutputVector; + fillLongVectorDataFromShaderBuffer(ShaderOutData, OutputVector, + VectorSizeToTest); + + VERIFY_SUCCEEDED(doVectorsMatch(OutputVector, ExpectedVector, + TestConfig.getTolerance(), + TestConfig.getValidationType())); +} diff --git a/tools/clang/unittests/HLSLExec/LongVectors.h b/tools/clang/unittests/HLSLExec/LongVectors.h new file mode 100644 index 0000000000..0e046d1966 --- /dev/null +++ b/tools/clang/unittests/HLSLExec/LongVectors.h @@ -0,0 +1,336 @@ +#ifndef LONGVECTORS_H +#define LONGVECTORS_H + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "LongVectorTestData.h" +#include "ShaderOpTest.h" +#include "TableParameterHandler.h" +#include "dxc/Support/WinIncludes.h" +#include "dxc/Support/dxcapi.use.h" +#include "dxc/Test/HlslTestUtils.h" + +namespace LongVector { +template +class TestConfig; // Forward declaration + +class OpTest { +public: + BEGIN_TEST_CLASS(OpTest) + END_TEST_CLASS() + + TEST_CLASS_SETUP(classSetup); + + BEGIN_TEST_METHOD(binaryOpTest) + TEST_METHOD_PROPERTY(L"DataSource", + L"Table:LongVectorOpTable.xml#BinaryOpTable") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(trigonometricOpTest) + TEST_METHOD_PROPERTY(L"DataSource", + L"Table:LongVectorOpTable.xml#TrigonometricOpTable") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(unaryOpTest) + TEST_METHOD_PROPERTY(L"DataSource", + L"Table:LongVectorOpTable.xml#UnaryOpTable") + END_TEST_METHOD() + + template + void dispatchTestByDataType(LongVectorOpTypeT OpType, std::wstring DataType, + TableParameterHandler &Handler); + + template + void dispatchTestByVectorSize(LongVectorOpTypeT OpType, + TableParameterHandler &Handler); + + template + void testBaseMethod( + LongVector::TestConfig &TestConfig, + size_t VectorSizeToTest); + +private: + dxc::DxcDllSupport DxcDllSupport; + bool Initialized = false; +}; + +template +void fillShaderBufferFromLongVectorData(std::vector &ShaderBuffer, + std::vector &TestData); + +template +void fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer, + std::vector &TestData, + size_t NumElements); + +template constexpr bool isFloatingPointType() { + return std::is_same_v || + std::is_same_v || + std::is_same_v; +} + +struct LongVectorOpTypeStringToEnumValue { + std::wstring OpTypeString; + uint32_t OpTypeValue; +}; + +template +DataTypeT getLongVectorOpType(const LongVectorOpTypeStringToEnumValue *Values, + const std::wstring &OpTypeString, + std::size_t Length); + +enum ValidationType { + ValidationType_Epsilon, + ValidationType_Ulp, +}; + +enum BasicOpType { + BasicOpType_Binary, + BasicOpType_Unary, + BasicOpType_ScalarBinary, + BasicOpType_EnumValueCount +}; + +enum BinaryOpType { + BinaryOpType_ScalarAdd, + BinaryOpType_ScalarMultiply, + BinaryOpType_ScalarSubtract, + BinaryOpType_ScalarDivide, + BinaryOpType_ScalarModulus, + BinaryOpType_Multiply, + BinaryOpType_Add, + BinaryOpType_Subtract, + BinaryOpType_Divide, + BinaryOpType_Modulus, + BinaryOpType_Min, + BinaryOpType_Max, + BinaryOpType_ScalarMin, + BinaryOpType_ScalarMax, + BinaryOpType_EnumValueCount +}; + +static const LongVectorOpTypeStringToEnumValue binaryOpTypeStringToEnumMap[] = { + {L"BinaryOpType_ScalarAdd", BinaryOpType_ScalarAdd}, + {L"BinaryOpType_ScalarMultiply", BinaryOpType_ScalarMultiply}, + {L"BinaryOpType_ScalarSubtract", BinaryOpType_ScalarSubtract}, + {L"BinaryOpType_ScalarDivide", BinaryOpType_ScalarDivide}, + {L"BinaryOpType_ScalarModulus", 
BinaryOpType_ScalarModulus}, + {L"BinaryOpType_Add", BinaryOpType_Add}, + {L"BinaryOpType_Multiply", BinaryOpType_Multiply}, + {L"BinaryOpType_Subtract", BinaryOpType_Subtract}, + {L"BinaryOpType_Divide", BinaryOpType_Divide}, + {L"BinaryOpType_Modulus", BinaryOpType_Modulus}, + {L"BinaryOpType_Min", BinaryOpType_Min}, + {L"BinaryOpType_Max", BinaryOpType_Max}, + {L"BinaryOpType_ScalarMin", BinaryOpType_ScalarMin}, + {L"BinaryOpType_ScalarMax", BinaryOpType_ScalarMax}, +}; + +static_assert(_countof(binaryOpTypeStringToEnumMap) == + BinaryOpType_EnumValueCount, + "binaryOpTypeStringToEnumMap size mismatch. Did you " + "add a new enum value?"); + +BinaryOpType getBinaryOpType(const std::wstring &OpTypeString); + +enum UnaryOpType { UnaryOpType_Initialize, UnaryOpType_EnumValueCount }; + +static const LongVectorOpTypeStringToEnumValue unaryOpTypeStringToEnumMap[] = { + {L"UnaryOpType_Initialize", UnaryOpType_Initialize}, +}; + +static_assert(_countof(unaryOpTypeStringToEnumMap) == + UnaryOpType_EnumValueCount, + "unaryOpTypeStringToEnumMap size mismatch. Did you add " + "a new enum value?"); + +UnaryOpType getUnaryOpType(const std::wstring &OpTypeString); + +enum TrigonometricOpType { + TrigonometricOpType_Acos, + TrigonometricOpType_Asin, + TrigonometricOpType_Atan, + TrigonometricOpType_Cos, + TrigonometricOpType_Cosh, + TrigonometricOpType_Sin, + TrigonometricOpType_Sinh, + TrigonometricOpType_Tan, + TrigonometricOpType_Tanh, + TrigonometricOpType_EnumValueCount +}; + +static const LongVectorOpTypeStringToEnumValue + trigonometricOpTypeStringToEnumMap[] = { + {L"TrigonometricOpType_Acos", TrigonometricOpType_Acos}, + {L"TrigonometricOpType_Asin", TrigonometricOpType_Asin}, + {L"TrigonometricOpType_Atan", TrigonometricOpType_Atan}, + {L"TrigonometricOpType_Cos", TrigonometricOpType_Cos}, + {L"TrigonometricOpType_Cosh", TrigonometricOpType_Cosh}, + {L"TrigonometricOpType_Sin", TrigonometricOpType_Sin}, + {L"TrigonometricOpType_Sinh", TrigonometricOpType_Sinh}, + {L"TrigonometricOpType_Tan", TrigonometricOpType_Tan}, + {L"TrigonometricOpType_Tanh", TrigonometricOpType_Tanh}, +}; + +static_assert(_countof(trigonometricOpTypeStringToEnumMap) == + TrigonometricOpType_EnumValueCount, + "trigonometricOpTypeStringToEnumMap size mismatch. Did you add " + "a new enum value?"); + +TrigonometricOpType getTrigonometricOpType(const std::wstring &OpTypeString); + +template +std::vector getInputValueSetByKey(const std::wstring &Key, + bool LogKey = true) { + if (LogKey) + WEX::Logging::Log::Comment( + WEX::Common::String().Format(L"Using Value Set Key: %s", Key.c_str())); + return std::vector(LongVectorTestData::Data.at(Key)); +} + +template +DataTypeT mod(const DataTypeT &A, const DataTypeT &B); + +template struct TestConfigTraits { + TestConfigTraits(LongVectorOpTypeT OpType) : OpType(OpType) {} + // LongVectorOpTypeT* Enum values. We don't use a UINT because + // we want the type data. 
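// For example, a TestConfig built from a BinaryOpType keeps that enum value
// here, so computeExpectedValue() can static_cast it straight back to
// BinaryOpType rather than round-tripping through a plain integer.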
+ LongVectorOpTypeT OpType; +}; + +template +bool doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance, ValidationType); +bool doValuesMatch(HLSLBool_t A, HLSLBool_t B, float, ValidationType); +bool doValuesMatch(HLSLHalf_t A, HLSLHalf_t B, float Tolerance, + ValidationType ValidationType); +bool doValuesMatch(float A, float B, float Tolerance, + ValidationType ValidationType); +bool doValuesMatch(double A, double B, float Tolerance, + ValidationType ValidationType); + +template +bool doVectorsMatch(const std::vector &ActualValues, + const std::vector &ExpectedValues, + float Tolerance, ValidationType ValidationType); +// Binary ops +template +std::vector +computeExpectedValues(const std::vector &InputVector1, + const std::vector &InputVector2, + const TestConfig &Config); + +// Binary scalar ops +template +std::vector +computeExpectedValues(const std::vector &InputVector1, + const DataTypeT &ScalarInput, + const TestConfig &Config); + +// Unary ops +template +std::vector +computeExpectedValues(const std::vector &InputVector1, + const TestConfig &Config); + +template +void logLongVector(const std::vector &Values, + const std::wstring &Name); + +// Used to pass into LongVectorOpTestBase +template class TestConfig { +public: + TestConfig() = default; + + TestConfig(UnaryOpType OpType); + TestConfig(BinaryOpType OpType); + TestConfig(TrigonometricOpType OpType); + + bool isBinaryOp() const { + return BasicOpType == LongVector::BasicOpType_Binary || + BasicOpType == LongVector::BasicOpType_ScalarBinary; + } + + bool isUnaryOp() const { + return BasicOpType == LongVector::BasicOpType_Unary; + } + + bool isScalarOp() const { + return BasicOpType == LongVector::BasicOpType_ScalarBinary; + } + + bool hasFunctionDefinition() const; + std::string getOPERAND2String() const; + + // A helper to get the hlsl type as a string for a given C++ type. + // Used in the long vector tests. 
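// For example, HLSLHalf_t maps to "half". For a 16-element BinaryOpType_Min
// test, getCompilerOptionsString(16) then assembles roughly
//   -DTYPE=half -DNUM=16 -enable-16bit-types -DOPERATOR=,
//   -DOPERAND2=InputVector2 -DIS_BINARY_VECTOR_OP=1 -DFUNC=min
// so the LongVectorOp shader's FUNC(InputVector1 OPERATOR OPERAND2)
// expression expands to min(InputVector1 , InputVector2).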
+ std::string getHLSLTypeString() const; + + DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B, + BinaryOpType OpType) const; + DataTypeT computeExpectedValue(const DataTypeT &A, const DataTypeT &B) const; + DataTypeT computeExpectedValue(const DataTypeT &A, + TrigonometricOpType OpType) const; + DataTypeT computeExpectedValue(const DataTypeT &A, UnaryOpType OpType) const; + DataTypeT computeExpectedValue(const DataTypeT &A) const; + + void setInputArgsArrayName(const std::wstring &InputArgsArrayName) { + this->InputArgsArrayName = InputArgsArrayName; + } + + void setInputValueSet1(const std::wstring &InputValueSetName) { + this->InputValueSetName1 = InputValueSetName; + } + + void setInputValueSet2(const std::wstring &InputValueSetName) { + this->InputValueSetName2 = InputValueSetName; + } + + std::vector getInputValueSet1() const { + return getInputValueSet(1); + } + + std::vector getInputValueSet2() const { + return getInputValueSet(2); + } + + std::vector getInputArgsArray() const; + + float getTolerance() const { return Tolerance; } + LongVector::ValidationType getValidationType() const { + return ValidationType; + } + + std::string getCompilerOptionsString(size_t VectorSize) const; + +private: + std::vector getInputValueSet(size_t ValueSetIndex) const; + + // To be used for the value of -DOPERATOR + std::string OperatorString; + // To be used for the value of -DFUNC + std::string IntrinsicString; + LongVector::BasicOpType BasicOpType = LongVector::BasicOpType_EnumValueCount; + float Tolerance = 0.0; + LongVector::ValidationType ValidationType = + LongVector::ValidationType::ValidationType_Epsilon; + LongVector::TestConfigTraits OpTypeTraits; + std::wstring InputValueSetName1 = L"DefaultInputValueSet1"; + std::wstring InputValueSetName2 = L"DefaultInputValueSet2"; + // No default args array + std::wstring InputArgsArrayName = L""; +}; // class LongVector::TestConfig + +}; // namespace LongVector + +#include "LongVectors.tpp" + +#endif // LONGVECTORS_H diff --git a/tools/clang/unittests/HLSLExec/LongVectors.tpp b/tools/clang/unittests/HLSLExec/LongVectors.tpp new file mode 100644 index 0000000000..29affa4b2e --- /dev/null +++ b/tools/clang/unittests/HLSLExec/LongVectors.tpp @@ -0,0 +1,650 @@ +template +DataTypeT LongVector::getLongVectorOpType(const LongVectorOpTypeStringToEnumValue *Values, + const std::wstring &OpTypeString, + std::size_t Length) { + for (size_t i = 0; i < Length; i++) { + if (Values[i].OpTypeString == OpTypeString) + return static_cast(Values[i].OpTypeValue); + } + + LOG_ERROR_FMT_THROW(L"Invalid LongVectorOpType string: %s", + OpTypeString.c_str()); + + return static_cast(UINT_MAX); +} + +// Helper to fill the shader buffer based on type. Convenient to be used when +// copying HLSL*_t types so we can copy the underlying type directly instead of +// the struct. +template +void LongVector::fillShaderBufferFromLongVectorData(std::vector &ShaderBuffer, std::vector &TestData) { + + // Note: DataSize for HLSLHalf_t and HLSLBool_t may be larger than the + // underlying type in some cases. Thats fine. Resize just makes sure we have + // enough space. 
+ const size_t NumElements = TestData.size(); + const size_t DataSize = sizeof(DataTypeT) * NumElements; + ShaderBuffer.resize(DataSize); + + if constexpr (std::is_same_v) { + DirectX::PackedVector::HALF *ShaderBufferPtr = + reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + ShaderBufferPtr[i] = TestData[i].Val; + } else if constexpr (std::is_same_v) { + int32_t *ShaderBufferPtr = reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + ShaderBufferPtr[i] = TestData[i].Val; + } else { + DataTypeT *ShaderBufferPtr = + reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + ShaderBufferPtr[i] = TestData[i]; + } +} + +// Helpers so we do the right thing for float types. HLSLHalf_t is handled in an +// operator overload. +template +DataTypeT LongVector::mod(const DataTypeT &A, const DataTypeT &B) { + return A % B; +} + +template <> float LongVector::mod(const float &A, const float &B) { + return std::fmod(A, B); +} + +template <> double LongVector::mod(const double &A, const double &B) { + return std::fmod(A, B); +} + +// Helper to fill the test data from the shader buffer based on type. Convenient +// to be used when copying HLSL*_t types so we can use the underlying type. +template +void LongVector::fillLongVectorDataFromShaderBuffer(MappedData &ShaderBuffer, + std::vector &TestData, + size_t NumElements) { + if constexpr (std::is_same_v) { + DirectX::PackedVector::HALF *ShaderBufferPtr = + reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + // HLSLHalf_t has a DirectX::PackedVector::HALF based constructor. + TestData.push_back(ShaderBufferPtr[i]); + } else if constexpr (std::is_same_v) { + int32_t *ShaderBufferPtr = reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + // HLSLBool_t has a int32_t based constructor. + TestData.push_back(ShaderBufferPtr[i]); + } else { + DataTypeT *ShaderBufferPtr = + reinterpret_cast(ShaderBuffer.data()); + for (size_t i = 0; i < NumElements; ++i) + TestData.push_back(ShaderBufferPtr[i]); + } +} + +template +bool LongVector::doValuesMatch(DataTypeT A, DataTypeT B, float Tolerance, + LongVector::ValidationType) { + if (Tolerance == 0.0f) + return A == B; + + DataTypeT Diff = A > B ? A - B : B - A; + return Diff <= Tolerance; +} + +bool LongVector::doValuesMatch(HLSLBool_t A, HLSLBool_t B, float, + LongVector::ValidationType) { + return A == B; +} + +bool LongVector::doValuesMatch(HLSLHalf_t A, HLSLHalf_t B, float Tolerance, + LongVector::ValidationType ValidationType) { + switch (ValidationType) { + case LongVector::ValidationType_Epsilon: + return CompareHalfEpsilon(A.Val, B.Val, Tolerance); + case LongVector::ValidationType_Ulp: + return CompareHalfULP(A.Val, B.Val, Tolerance); + default: + WEX::Logging::Log::Error( + L"Invalid ValidationType. Expecting Epsilon or ULP."); + return false; + } +} + +bool LongVector::doValuesMatch(float A, float B, float Tolerance, + LongVector::ValidationType ValidationType) { + switch (ValidationType) { + case LongVector::ValidationType_Epsilon: + return CompareFloatEpsilon(A, B, Tolerance); + case LongVector::ValidationType_Ulp: { + // Tolerance is in ULPs. Convert to int for the comparison. + const int IntTolerance = static_cast(Tolerance); + return CompareFloatULP(A, B, IntTolerance); + }; + default: + WEX::Logging::Log::Error( + L"Invalid ValidationType. 
Expecting Epsilon or ULP."); + return false; + } +} + +bool LongVector::doValuesMatch(double A, double B, float Tolerance, + LongVector::ValidationType ValidationType) { + switch (ValidationType) { + case LongVector::ValidationType_Epsilon: + return CompareDoubleEpsilon(A, B, Tolerance); + case LongVector::ValidationType_Ulp: { + // Tolerance is in ULPs. Convert to int64_t for the comparison. + const int64_t IntTolerance = static_cast(Tolerance); + return CompareDoubleULP(A, B, IntTolerance); + }; + default: + WEX::Logging::Log::Error( + L"Invalid ValidationType. Expecting Epsilon or ULP."); + return false; + } +} + + +template +bool LongVector::doVectorsMatch(const std::vector &ActualValues, + const std::vector &ExpectedValues, + float Tolerance, + LongVector::ValidationType ValidationType) { + // Stash mismatched indexes for easy failure logging later + std::vector MismatchedIndexes; + VERIFY_IS_TRUE(ActualValues.size() == ExpectedValues.size(), + L"doVectorsMatch() called with mismatched vector sizes."); + for (size_t i = 0; i < ActualValues.size(); ++i) { + if (!doValuesMatch(ActualValues[i], ExpectedValues[i], Tolerance, + ValidationType)) + MismatchedIndexes.push_back(i); + } + + if (MismatchedIndexes.empty()) + return true; + + if (!MismatchedIndexes.empty()) { + for (size_t Index : MismatchedIndexes) { + std::wstringstream Wss(L""); + Wss << std::setprecision(15); // Set precision for floating point types + Wss << L"Mismatch at Index: " << Index; + Wss << L" Actual Value:" << ActualValues[Index] << ","; + Wss << L" Expected Value:" << ExpectedValues[Index]; + WEX::Logging::Log::Error(Wss.str().c_str()); + } + } + + return false; +} + +template +std::vector LongVector::computeExpectedValues( + const std::vector &InputVector1, + const std::vector &InputVector2, + const LongVector::TestConfig &Config) { + + VERIFY_IS_TRUE( + Config.isBinaryOp(), + L"computeExpectedValues() called with a non-binary op config."); + + std::vector ExpectedValues = {}; + + for (size_t i = 0; i < InputVector1.size(); ++i) + ExpectedValues.push_back( + Config.computeExpectedValue(InputVector1[i], InputVector2[i])); + + return ExpectedValues; +} + +template +std::vector LongVector::computeExpectedValues( + const std::vector &InputVector1, const DataTypeT &ScalarInput, + const LongVector::TestConfig &Config) { + + VERIFY_IS_TRUE(Config.isScalarOp(), L"computeExpectedValues() called with a " + L"non-binary non-scalar op config."); + + std::vector ExpectedValues; + + for (size_t i = 0; i < InputVector1.size(); ++i) + ExpectedValues.push_back( + Config.computeExpectedValue(InputVector1[i], ScalarInput)); + + return ExpectedValues; +} + +template +std::vector LongVector::computeExpectedValues( + const std::vector &InputVector1, + const LongVector::TestConfig &Config) { + + VERIFY_IS_TRUE(Config.isUnaryOp(), + L"computeExpectedValues() called with a non-unary op config."); + + std::vector ExpectedValues; + + for (size_t i = 0; i < InputVector1.size(); ++i) + ExpectedValues.push_back(Config.computeExpectedValue(InputVector1[i])); + + return ExpectedValues; +} + +template +void LongVector::logLongVector(const std::vector &Values, + const std::wstring &Name) { + WEX::Logging::Log::Comment( + WEX::Common::String().Format(L"LongVector Name: %s", Name.c_str())); + + const size_t LoggingWidth = 40; + + std::wstringstream Wss(L""); + Wss << L"LongVector Values: "; + Wss << L"["; + const size_t NumElements = Values.size(); + for (size_t i = 0; i < NumElements; i++) { + if (i % LoggingWidth == 0 && i != 0) + Wss << L"\n "; + 
Wss << Values[i]; + if (i != NumElements - 1) + Wss << L", "; + } + Wss << L" ]"; + + WEX::Logging::Log::Comment(Wss.str().c_str()); +} + +template +LongVector::TestConfig::TestConfig(LongVector::UnaryOpType OpType) + : OpTypeTraits(OpType) { + IntrinsicString = ""; + BasicOpType = LongVector::BasicOpType_Unary; + + if (isFloatingPointType()) + Tolerance = 1; + + switch (OpType) { + case LongVector::UnaryOpType_Initialize: + IntrinsicString = "TestInitialize"; + break; + default: + VERIFY_FAIL("Invalid UnaryOpType"); + } +} + +template +LongVector::TestConfig::TestConfig(LongVector::BinaryOpType OpType) + : OpTypeTraits(OpType) { + IntrinsicString = ""; + BasicOpType = LongVector::BasicOpType_Binary; + + if (isFloatingPointType()) + Tolerance = 1; + ValidationType = LongVector::ValidationType_Ulp; + + switch (OpType) { + case LongVector::BinaryOpType_ScalarAdd: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = "+"; + break; + case LongVector::BinaryOpType_ScalarMultiply: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = "*"; + break; + case LongVector::BinaryOpType_ScalarSubtract: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = "-"; + break; + case LongVector::BinaryOpType_ScalarDivide: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = "/"; + break; + case LongVector::BinaryOpType_ScalarModulus: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = "%"; + break; + case LongVector::BinaryOpType_Multiply: + OperatorString = "*"; + break; + case LongVector::BinaryOpType_Add: + OperatorString = "+"; + break; + case LongVector::BinaryOpType_Subtract: + OperatorString = "-"; + break; + case LongVector::BinaryOpType_Divide: + OperatorString = "/"; + break; + case LongVector::BinaryOpType_Modulus: + OperatorString = "%"; + break; + case LongVector::BinaryOpType_Min: + OperatorString = ","; + IntrinsicString = "min"; + break; + case LongVector::BinaryOpType_Max: + OperatorString = ","; + IntrinsicString = "max"; + break; + case LongVector::BinaryOpType_ScalarMin: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = ","; + IntrinsicString = "min"; + break; + case LongVector::BinaryOpType_ScalarMax: + BasicOpType = LongVector::BasicOpType_ScalarBinary; + OperatorString = ","; + IntrinsicString = "max"; + break; + default: + VERIFY_FAIL("Invalid BinaryOpType"); + } +} + +template +LongVector::TestConfig::TestConfig(LongVector::TrigonometricOpType OpType) + : OpTypeTraits(OpType) { + IntrinsicString = ""; + BasicOpType = LongVector::BasicOpType_Unary; + + // All trigonometric ops are floating point types. + // These trig functions are defined to have a max absolute error of 0.0008 + // as per the D3D functional specs. An example with this spec for sin and + // cos is available here: + // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#22.10.20 + ValidationType = LongVector::ValidationType_Epsilon; + if (std::is_same_v) + Tolerance = 0.0010f; + else if (std::is_same_v) + Tolerance = 0.0008f; + else + VERIFY_FAIL( + "Invalid type for trigonometric op. 
Expecting half or float."); + + switch (OpType) { + case LongVector::TrigonometricOpType_Acos: + IntrinsicString = "acos"; + break; + case LongVector::TrigonometricOpType_Asin: + IntrinsicString = "asin"; + break; + case LongVector::TrigonometricOpType_Atan: + IntrinsicString = "atan"; + break; + case LongVector::TrigonometricOpType_Cos: + IntrinsicString = "cos"; + break; + case LongVector::TrigonometricOpType_Cosh: + IntrinsicString = "cosh"; + break; + case LongVector::TrigonometricOpType_Sin: + IntrinsicString = "sin"; + break; + case LongVector::TrigonometricOpType_Sinh: + IntrinsicString = "sinh"; + break; + case LongVector::TrigonometricOpType_Tan: + IntrinsicString = "tan"; + break; + case LongVector::TrigonometricOpType_Tanh: + IntrinsicString = "tanh"; + break; + default: + VERIFY_FAIL("Invalid TrigonometricOpType"); + } +} + +template +bool LongVector::TestConfig::hasFunctionDefinition() const { + if constexpr (std::is_same_v) { + if (OpTypeTraits.OpType == LongVector::UnaryOpType_Initialize) + return true; + else + return false; + } + + return false; +} + +template +std::string LongVector::TestConfig::getOPERAND2String() const { + if (hasFunctionDefinition()) { + switch (static_cast(OpTypeTraits.OpType)) { + case LongVector::UnaryOpType_Initialize: + return std::string(" -DFUNC_INITIALIZE=1"); + default: + VERIFY_FAIL("Invalid UnaryOpType"); + } + } + return std::string(""); +} + +template +std::string LongVector::TestConfig::getHLSLTypeString() const { + if (std::is_same_v) + return "bool"; + if (std::is_same_v) + return "half"; + if (std::is_same_v) + return "float"; + if (std::is_same_v) + return "double"; + if (std::is_same_v) + return "int16_t"; + if (std::is_same_v) + return "int"; + if (std::is_same_v) + return "int64_t"; + if (std::is_same_v) + return "uint16_t"; + if (std::is_same_v) + return "uint32_t"; + if (std::is_same_v) + return "uint64_t"; + + std::string ErrStr("getHLSLTypeString() Unsupported type: "); + ErrStr.append(typeid(DataTypeT).name()); + VERIFY_IS_TRUE(false, ErrStr.c_str()); + return "UnknownType"; +} + +template +DataTypeT LongVector::TestConfig::computeExpectedValue(const DataTypeT &A, const DataTypeT &B, + LongVector::BinaryOpType OpType) const { + switch (OpType) { + case LongVector::BinaryOpType_ScalarAdd: + return A + B; + case LongVector::BinaryOpType_ScalarMultiply: + return A * B; + case LongVector::BinaryOpType_ScalarSubtract: + return A - B; + case LongVector::BinaryOpType_ScalarDivide: + return A / B; + case LongVector::BinaryOpType_ScalarModulus: + return mod(A, B); + case LongVector::BinaryOpType_Multiply: + return A * B; + case LongVector::BinaryOpType_Add: + return A + B; + case LongVector::BinaryOpType_Subtract: + return A - B; + case LongVector::BinaryOpType_Divide: + return A / B; + case LongVector::BinaryOpType_Modulus: + return mod(A, B); + case LongVector::BinaryOpType_Min: + // std::max and std::min are wrapped in () to avoid collisions with the // + // macro defintions for min and max in windows.h + return (std::min)(A, B); + case LongVector::BinaryOpType_Max: + return (std::max)(A, B); + case LongVector::BinaryOpType_ScalarMin: + return (std::min)(A, B); + case LongVector::BinaryOpType_ScalarMax: + return (std::max)(A, B); + default: + LOG_ERROR_FMT_THROW(L"Unknown BinaryOpType: %d", OpTypeTraits.OpType); + return DataTypeT(); + } +} + +template +DataTypeT LongVector::TestConfig::computeExpectedValue(const DataTypeT &A, const DataTypeT &B) const { + if(!isBinaryOp()) + LOG_ERROR_FMT_THROW( + L"computeExpectedValue(const 
DataTypeT &A, const DataTypeT &B) called " + L"on a unary op: %d", + OpTypeTraits.OpType); + + return computeExpectedValue(A, B, static_cast(OpTypeTraits.OpType)); +} + + +template +DataTypeT LongVector::TestConfig::computeExpectedValue(const DataTypeT &A, + LongVector::UnaryOpType OpType) const { + switch (OpType) { + case LongVector::UnaryOpType_Initialize: + return A; + default: + LOG_ERROR_FMT_THROW(L"Unknown UnaryOpType :%d", OpTypeTraits.OpType); + return DataTypeT(); + } +} + +template +DataTypeT LongVector::TestConfig::computeExpectedValue(const DataTypeT &A) const { + + if constexpr (std::is_same_v) { + const auto OpType = static_cast(OpTypeTraits.OpType); + // HLSLHalf_t is a struct. We need to call the constructor to get the + // expected value. + return computeExpectedValue(A, OpType); + } + + if constexpr (std::is_same_v) { + const auto OpType = static_cast(OpTypeTraits.OpType); + // HLSLHalf_t is a struct. We need to call the constructor to get the + // expected value. + return computeExpectedValue(A, OpType); + } + + LOG_ERROR_FMT_THROW( + L"computeExpectedValue(const DataType&A) called on an unrecognized binary op: %d", + OpTypeTraits.OpType); + + return DataTypeT(); +} + +template +DataTypeT LongVector::TestConfig::computeExpectedValue(const DataTypeT &A, + LongVector::TrigonometricOpType OpType) const { + // The trig functions are only valid on floating point types. The constexpr in + // this case is a relatively easy and clean way to prevent the compiler from + // erroring out trying to resolve these for the non floating point types. We + // won't use them in the first place. + if constexpr (isFloatingPointType()) { + switch (OpType) { + case LongVector::TrigonometricOpType_Acos: + return std::acos(A); + case LongVector::TrigonometricOpType_Asin: + return std::asin(A); + case LongVector::TrigonometricOpType_Atan: + return std::atan(A); + case LongVector::TrigonometricOpType_Cos: + return std::cos(A); + case LongVector::TrigonometricOpType_Cosh: + return std::cosh(A); + case LongVector::TrigonometricOpType_Sin: + return std::sin(A); + case LongVector::TrigonometricOpType_Sinh: + return std::sinh(A); + case LongVector::TrigonometricOpType_Tan: + return std::tan(A); + case LongVector::TrigonometricOpType_Tanh: + return std::tanh(A); + default: + LOG_ERROR_FMT_THROW(L"Unknown TrigonometricOpType: %d", + OpTypeTraits.OpType); + return DataTypeT(); + } + } + + LOG_ERROR_FMT_THROW(L"ComputeExpectedValue(const DataTypeT &A, " + L"LongVectorOpTypeT OpType) called on a " + L"non-float type: %d", + OpType); + + return DataTypeT(); +} + +template +std::vector LongVector::TestConfig::getInputArgsArray() const { + + std::vector InputArgs; + + std::wstring InputArgsArrayName = this->InputArgsArrayName; + + if (InputArgsArrayName.empty()) + VERIFY_FAIL("No args array name set."); + + if (std::is_same_v && isClampOp()) + VERIFY_FAIL("Clamp is not supported for bools."); + else + return getInputValueSetByKey(InputArgsArrayName, false); + + VERIFY_FAIL("Invalid type for args array."); + return std::vector(); +} + +template +std::string LongVector::TestConfig::getCompilerOptionsString(size_t VectorSize) const { + std::stringstream CompilerOptions(""); + std::string HLSLType = getHLSLTypeString(); + CompilerOptions << "-DTYPE="; + CompilerOptions << HLSLType; + CompilerOptions << " -DNUM="; + CompilerOptions << VectorSize; + const bool Is16BitType = + (HLSLType == "int16_t" || HLSLType == "uint16_t" || HLSLType == "half"); + CompilerOptions << (Is16BitType ? 
" -enable-16bit-types" : ""); + CompilerOptions << " -DOPERATOR="; + CompilerOptions << OperatorString; + + if (isBinaryOp()) { + CompilerOptions << " -DOPERAND2="; + CompilerOptions << (isScalarOp() ? "InputScalar" : "InputVector2"); + + if (isScalarOp()) + CompilerOptions << " -DIS_SCALAR_OP=1"; + else + CompilerOptions << " -DIS_BINARY_VECTOR_OP=1"; + + CompilerOptions << " -DFUNC="; + CompilerOptions << IntrinsicString; + } else { // Unary Op + CompilerOptions << " -DFUNC="; + CompilerOptions << IntrinsicString; + CompilerOptions << " -DOPERAND2="; + CompilerOptions << getOPERAND2String(); + } + + return CompilerOptions.str(); +} + +template +std::vector LongVector::TestConfig::getInputValueSet(size_t ValueSetIndex) const { + if (ValueSetIndex == 2 && !isBinaryOp()) + VERIFY_FAIL("ValueSetindex==2 is only valid for binary ops."); + + std::wstring InputValueSetName = L""; + if (ValueSetIndex == 1) + InputValueSetName = InputValueSetName1; + else if (ValueSetIndex == 2) + InputValueSetName = InputValueSetName2; + else + VERIFY_FAIL("Invalid ValueSetIndex"); + + return getInputValueSetByKey(InputValueSetName); +} diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index e768f205f1..dbea8e2aaf 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -1976,7 +1976,7 @@ RWStructuredBuffer g_shareXchg64Buf : register(u5); groupshared uint64_t g_uint64Share[6]; - groupshared int64_t g_sint64Share[3]; + groupshared int64_t g_sint64Share[4]; groupshared uint64_t g_xchg64Share[64]; #define VEC_CALL(op, uav, ix, val) op(uav[ix*stride], val); @@ -2046,7 +2046,7 @@ // Zero-init shared memory, with special cases if (ix < 6) g_uint64Share[ix] = ix == 1 ? 99999999ULL | (99999999ULL << 32) : ix == 3 ? ~0ULL : 0; - if (ix < 3) + if (ix < 4) g_sint64Share[ix] = ix == 1 ? 99999999ULL | (99999999ULL << 32) : 0; if (ix < 64) g_xchg64Share[ix] = 0; @@ -2552,11 +2552,11 @@ void InitSharedMem(uint ix) { // Zero-init shared memory, with special cases - if (ix < 6) + if (ix < 7) g_uintShare[ix] = ix == 1 ? 99999999 : ix == 3 ? -1 : 0; - if (ix < 3) + if (ix < 4) g_sintShare[ix] = ix == 1 ? 
99999999 : 0; - if (ix < 64) + if (ix < 65) g_xchgShare[ix] = 0; GroupMemoryBarrierWithGroupSync(); @@ -3750,4 +3750,71 @@ void MSMain(uint GID : SV_GroupIndex, + + RootFlags(0), UAV(u0), UAV(u1), UAV(u2), + UAV(u3) + + + + + + + + + + + + + + + TestInitialize(vector Vector) + { + vector VectorCopy = Vector; + return VectorCopy; + } + #endif + + RWByteAddressBuffer g_InputFuncArgs : register(u0); + RWByteAddressBuffer g_InputVector1 : register(u1); + RWByteAddressBuffer g_InputVector2 : register(u2); + RWByteAddressBuffer g_OutputVector : register(u3); + [numthreads(1,1,1)] + void main(uint GI : SV_GroupIndex) { + + vector InputVector1 = g_InputVector1.Load< vector >(0); + + #ifdef IS_BINARY_VECTOR_OP + vector InputVector2 = g_InputVector2.Load< vector >(0); + #endif + + #ifdef IS_SCALAR_OP + TYPE InputScalar = g_InputFuncArgs.Load(0); + #endif + + #ifdef FUNC_CLAMP + TYPE Clamp_ArgMin = g_InputFuncArgs.Load(0); + TYPE Clamp_ArgMax = g_InputFuncArgs.Load(sizeof(TYPE)); + vector ClampArgMinMax = {Clamp_ArgMin, Clamp_ArgMax}; + #endif + + vector OutputVector = FUNC(InputVector1 OPERATOR OPERAND2); + + g_OutputVector.Store< vector >(0, OutputVector); + }; + ]]> + + diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp index e6c9b10f6c..60ce3a9241 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp +++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.cpp @@ -10,7 +10,7 @@ /////////////////////////////////////////////////////////////////////////////// // We need to keep & fix these warnings to integrate smoothly with HLK -#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389) +#pragma warning(error : 4100 4242 4244 4267 4701 4389) #include "d3dx12.h" #include @@ -258,6 +258,15 @@ void CommandListRefs::CreateForDevice(ID3D12Device *pDevice, bool compute) { IID_PPV_ARGS(&List))); } +ShaderOpTest::ShaderOpTest() { + m_hFence = CreateEvent(nullptr, FALSE, FALSE, nullptr); + if (m_hFence == nullptr) { + AtlThrow(HRESULT_FROM_WIN32(GetLastError())); + } +} + +ShaderOpTest::~ShaderOpTest() { CloseHandle(m_hFence); } + void ShaderOpTest::CopyBackResources() { CommandListRefs ResCommandList; ResCommandList.CreateForDevice(m_pDevice, m_pShaderOp->IsCompute()); @@ -423,10 +432,6 @@ void ShaderOpTest::CreateDevice() { CHECK_HR(m_pDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE, __uuidof(ID3D12Fence), (void **)&m_pFence)); m_pFence->SetName(L"ShaderOpTest Fence"); - m_hFence = CreateEvent(nullptr, FALSE, FALSE, nullptr); - if (m_hFence == nullptr) { - AtlThrow(HRESULT_FROM_WIN32(GetLastError())); - } } static void InitByteCode(D3D12_SHADER_BYTECODE *pBytecode, ID3D10Blob *pBlob) { @@ -861,6 +866,11 @@ void ShaderOpTest::CreateShaders() { CHECK_HR(pLibrary->CreateBlobWithEncodingFromPinned( pText, (UINT32)strlen(pText), CP_UTF8, &pTextBlob)); CHECK_HR(m_pDxcSupport->CreateInstance(CLSID_DxcCompiler, &pCompiler)); + WEX::Logging::Log::Comment(L"Compiling shader:"); + ShaderOpLogFmt(L"\tTarget profile: %S", S.Target); + if (argumentsWList.size() > 0) { + ShaderOpLogFmt(L"\tArguments: %S", pArguments); + } CHECK_HR(pCompiler->Compile(pTextBlob, nameW, entryPointW, targetW, (LPCWSTR *)argumentsWList.data(), (UINT32)argumentsWList.size(), nullptr, 0, @@ -2747,6 +2757,74 @@ bool ShaderOpParser::ReadAtElementName(IXmlReader *pReader, LPCWSTR pName) { } } +std::shared_ptr +RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback, + st::ShaderOpTest::TShaderCallbackFn 
pShaderCallback, + std::shared_ptr ShaderOpSet) { + st::ShaderOp *pShaderOp; + if (pName == nullptr) { + if (ShaderOpSet->ShaderOps.size() != 1) { + VERIFY_FAIL(L"Expected a single shader operation."); + } + pShaderOp = ShaderOpSet->ShaderOps[0].get(); + } else { + pShaderOp = ShaderOpSet->GetShaderOp(pName); + } + if (pShaderOp == nullptr) { + std::string msg = "Unable to find shader op "; + msg += pName; + msg += "; available ops"; + const char sep = ':'; + for (auto &pAvailOp : ShaderOpSet->ShaderOps) { + msg += sep; + msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]"; + } + CA2W msgWide(msg.c_str()); + VERIFY_FAIL(msgWide.m_psz); + } + + // This won't actually be used since we're supplying the device, + // but let's make it consistent. + pShaderOp->UseWarpDevice = hlsl_test::GetTestParamUseWARP(true); + + std::shared_ptr test = std::make_shared(); + test->SetDxcSupport(&support); + test->SetInitCallback(pInitCallback); + test->SetShaderCallback(pShaderCallback); + test->SetDevice(pDevice); + test->RunShaderOp(pShaderOp); + + std::shared_ptr result = + std::make_shared(); + result->ShaderOpSet = ShaderOpSet; + result->Test = test; + result->ShaderOp = pShaderOp; + return result; +} + +std::shared_ptr +RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback, + std::shared_ptr ShaderOpSet) { + return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback, + nullptr, ShaderOpSet); +} + +std::shared_ptr +RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + IStream *pStream, LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback) { + DXASSERT_NOMSG(pStream != nullptr); + std::shared_ptr ShaderOpSet = + std::make_shared(); + st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get()); + return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback, + ShaderOpSet); +} + #pragma endregion Parsing support } // namespace st diff --git a/tools/clang/unittests/HLSLExec/ShaderOpTest.h b/tools/clang/unittests/HLSLExec/ShaderOpTest.h index e65bd9e4e5..e8298fc8d9 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpTest.h +++ b/tools/clang/unittests/HLSLExec/ShaderOpTest.h @@ -12,12 +12,12 @@ // results. // // // /////////////////////////////////////////////////////////////////////////////// - -#pragma once - #ifndef __SHADEROPTEST_H__ #define __SHADEROPTEST_H__ +#include +#include +#include #include #include #include @@ -26,7 +26,7 @@ #include // We need to keep & fix these warnings to integrate smoothly with HLK -#pragma warning(error : 4100 4146 4242 4244 4267 4701 4389) +#pragma warning(error : 4100 4242 4244 4267 4701 4389) /////////////////////////////////////////////////////////////////////////////// // Forward declarations. @@ -275,6 +275,9 @@ class ShaderOpTest { typedef std::function TShaderCallbackFn; + + ShaderOpTest(); + ~ShaderOpTest(); void GetPipelineStats(D3D12_QUERY_DATA_PIPELINE_STATISTICS *pStats); void GetReadBackData(LPCSTR pResourceName, MappedData *pData); void RunShaderOp(ShaderOp *pShaderOp); @@ -341,6 +344,32 @@ void ParseShaderOpSetFromStream(IStream *pStream, ShaderOpSet *pShaderOpSet); // Deserialize a ShaderOpSet from an IXmlReader instance. void ParseShaderOpSetFromXml(IXmlReader *pReader, ShaderOpSet *pShaderOpSet); +/////////////////////////////////////////////////////////////////////////////// +// RunShaderOpTest* helper functions. 
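The declarations below capture the pattern the new LongVector tests rely on: parse a ShaderOpSet from an XML stream, run the named shader op with an init callback that fills the input resources, then read back the results. A minimal self-contained sketch of that flow follows; the function name, the std::vector<BYTE> callback parameter, and the local names are illustrative assumptions modeled on the LongVectors.cpp call site rather than part of this header.

    // Illustrative sketch, assuming the callback signature used elsewhere in
    // ShaderOpTest (LPCSTR resource name, byte buffer, ShaderOp pointer).
    static void RunLongVectorOpSketch(ID3D12Device *pDevice,
                                      dxc::DxcDllSupport &Support,
                                      IStream *pXmlStream) {
      // Parse the shader op set from the XML stream and run "LongVectorOp".
      // The callback fires once for each resource named in the XML.
      std::shared_ptr<st::ShaderOpTestResult> Result = st::RunShaderOpTest(
          pDevice, Support, pXmlStream, "LongVectorOp",
          [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *Op) {
            // Fill "InputVector1"/"InputVector2" buffers and set compiler
            // arguments on Op->Shaders here, as the LongVector tests do.
          });

      // Map the "OutputVector" UAV back to CPU memory for verification.
      st::MappedData Output;
      Result->Test->GetReadBackData("OutputVector", &Output);
    }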
+struct ShaderOpTestResult { + st::ShaderOp *ShaderOp; + std::shared_ptr ShaderOpSet; + std::shared_ptr Test; +}; + +std::shared_ptr +RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback, + st::ShaderOpTest::TShaderCallbackFn pShaderCallback, + std::shared_ptr ShaderOpSet); + +std::shared_ptr +RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback, + std::shared_ptr ShaderOpSet); + +std::shared_ptr +RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support, + IStream *pStream, LPCSTR pName, + st::ShaderOpTest::TInitCallbackFn pInitCallback); + } // namespace st #endif // __SHADEROPTEST_H__ diff --git a/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp b/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp new file mode 100644 index 0000000000..16badb074d --- /dev/null +++ b/tools/clang/unittests/HLSLExec/TableParameterHandler.cpp @@ -0,0 +1,376 @@ +#include "TableParameterHandler.h" +#include "dxc/Test/HlslTestUtils.h" + +TableParameterHandler::TableParameterHandler(TableParameter *pTable, + size_t size) + : m_table(pTable), m_tableSize(size) { + clearTableParameter(); + VERIFY_SUCCEEDED(ParseTableRow()); +} + +TableParameter *TableParameterHandler::GetTableParamByName(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &m_table[i]; + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +void TableParameterHandler::clearTableParameter() { + for (size_t i = 0; i < m_tableSize; ++i) { + m_table[i].m_int32 = 0; + m_table[i].m_uint = 0; + m_table[i].m_double = 0; + m_table[i].m_bool = false; + m_table[i].m_str = WEX::Common::String(); + } +} + +template +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_int32Table); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_int8Table); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_int16Table); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_uint32Table); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_floatTable); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector 
*TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_halfTable); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_doubleTable); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +template <> +std::vector *TableParameterHandler::GetDataArray(LPCWSTR name) { + for (size_t i = 0; i < m_tableSize; ++i) { + if (_wcsicmp(name, m_table[i].m_name) == 0) { + return &(m_table[i].m_boolTable); + } + } + DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name); + return nullptr; +} + +HRESULT TableParameterHandler::ParseTableRow() { + TableParameter *table = m_table; + for (unsigned int i = 0; i < m_tableSize; ++i) { + switch (table[i].m_type) { + case TableParameter::INT8: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_int32)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int16 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_int8 = (int8_t)(table[i].m_int32); + break; + case TableParameter::INT16: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_int32)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int16 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_int16 = (short)(table[i].m_int32); + break; + case TableParameter::INT32: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_int32)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + break; + case TableParameter::UINT: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_uint)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + break; + case TableParameter::DOUBLE: + if (FAILED(WEX::TestExecution::TestData::TryGetValue( + table[i].m_name, table[i].m_double)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + break; + case TableParameter::STRING: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_str)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + break; + case TableParameter::BOOL: + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + table[i].m_str)) && + table[i].m_bool) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + break; + case TableParameter::INT8_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + // TryGetValue does not suppport reading from int8 + table[i].m_int8Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_int8Table[j] = (int8_t)tempTable[j]; + } + break; + } + case TableParameter::INT16_TABLE: { + 
WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + // TryGetValue does not suppport reading from int8 + table[i].m_int16Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_int16Table[j] = (int16_t)tempTable[j]; + } + break; + } + case TableParameter::INT32_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_int32Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_int32Table[j] = tempTable[j]; + } + break; + } + case TableParameter::UINT8_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + // TryGetValue does not suppport reading from int8 + table[i].m_int8Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_int8Table[j] = (uint8_t)tempTable[j]; + } + break; + } + case TableParameter::UINT16_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + // TryGetValue does not suppport reading from int8 + table[i].m_uint16Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_uint16Table[j] = (uint16_t)tempTable[j]; + } + break; + } + case TableParameter::UINT32_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_uint32Table.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_uint32Table[j] = tempTable[j]; + } + break; + } + case TableParameter::FLOAT_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_floatTable.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + ParseDataToFloat(tempTable[j], table[i].m_floatTable[j]); + } + break; + } + case TableParameter::HALF_TABLE: { + WEX::TestExecution::TestDataArray tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not suppport reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_halfTable.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { 
+ uint16_t value = 0; + if (IsHexString(tempTable[j], &value)) { + table[i].m_halfTable[j] = value; + } else { + float val; + ParseDataToFloat(tempTable[j], val); + if (isdenorm(val)) + table[i].m_halfTable[j] = + signbit(val) ? Float16NegDenorm : Float16PosDenorm; + else + table[i].m_halfTable[j] = ConvertFloat32ToFloat16(val); + } + } + break; + } + case TableParameter::DOUBLE_TABLE: { + WEX::TestExecution::TestDataArray<double> tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not support reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_doubleTable.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_doubleTable[j] = tempTable[j]; + } + break; + } + case TableParameter::BOOL_TABLE: { + WEX::TestExecution::TestDataArray<bool> tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not support reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_boolTable.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_boolTable[j] = tempTable[j]; + } + break; + } + case TableParameter::STRING_TABLE: { + WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable; + if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name, + tempTable)) && + table[i].m_required) { + // TryGetValue does not support reading from int8 + hlsl_test::LogErrorFmt(L"Failed to get %s", table[i].m_name); + return E_FAIL; + } + table[i].m_StringTable.resize(tempTable.GetSize()); + for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) { + table[i].m_StringTable[j] = tempTable[j]; + } + break; + } + default: + DXASSERT_NOMSG("Invalid Parameter Type"); + } + if (errno == ERANGE) { + hlsl_test::LogErrorFmt(L"got out of range value for table %s", + table[i].m_name); + return E_FAIL; + } + } + return S_OK; +} diff --git a/tools/clang/unittests/HLSLExec/TableParameterHandler.h b/tools/clang/unittests/HLSLExec/TableParameterHandler.h new file mode 100644 index 0000000000..eac851a263 --- /dev/null +++ b/tools/clang/unittests/HLSLExec/TableParameterHandler.h @@ -0,0 +1,205 @@ +#ifndef TABLE_PARAMETER_HANDLER_H +#define TABLE_PARAMETER_HANDLER_H + +#include <algorithm> +#include <cfloat> +#include <cmath> +#include <cstdint> +#include <cwchar> +#include <string> +#include <vector> +#include <windows.h> // For LPCWSTR + +#include "dxc/Support/Global.h" // For DXASSERT_ARGS +#include "dxc/Test/HlslTestUtils.h" + +// Parameter representation for TAEF data-driven tests +struct TableParameter { + LPCWSTR m_name; + enum TableParameterType { + INT8, + INT16, + INT32, + UINT, + FLOAT, + HALF, + DOUBLE, + STRING, + BOOL, + INT8_TABLE, + INT16_TABLE, + INT32_TABLE, + FLOAT_TABLE, + HALF_TABLE, + DOUBLE_TABLE, + STRING_TABLE, + UINT8_TABLE, + UINT16_TABLE, + UINT32_TABLE, + BOOL_TABLE + }; + TableParameter(LPCWSTR name, TableParameterType type, bool required) + : m_name(name), m_type(type), m_required(required) {} + TableParameterType m_type; + bool m_required; // required parameter + int8_t m_int8; + int16_t m_int16; + int m_int32; + unsigned int m_uint; + float m_float; + uint16_t m_half; // no such thing as half type in c++.
Use int16 instead + double m_double; + bool m_bool; + WEX::Common::String m_str; + std::vector<int8_t> m_int8Table; + std::vector<int16_t> m_int16Table; + std::vector<int32_t> m_int32Table; + std::vector<uint8_t> m_uint8Table; + std::vector<uint16_t> m_uint16Table; + std::vector<uint32_t> m_uint32Table; + std::vector<float> m_floatTable; + std::vector<uint16_t> m_halfTable; // no such thing as half type in c++ + std::vector<double> m_doubleTable; + std::vector<bool> m_boolTable; + std::vector<WEX::Common::String> m_StringTable; +}; + +class TableParameterHandler { +private: + HRESULT ParseTableRow(); + +public: + TableParameter *m_table; + size_t m_tableSize; + TableParameterHandler(TableParameter *pTable, size_t size); + + TableParameter *GetTableParamByName(LPCWSTR name); + void clearTableParameter(); + + template <class T> std::vector<T> *GetDataArray(LPCWSTR name); +}; + +// Static helpers +static bool IsHexString(PCWSTR str, uint16_t *value) { + std::wstring wString(str); + wString.erase(std::remove(wString.begin(), wString.end(), L' '), + wString.end()); + LPCWSTR wstr = wString.c_str(); + if (wcsncmp(wstr, L"0x", 2) == 0 || wcsncmp(wstr, L"0b", 2) == 0) { + *value = (uint16_t)wcstol(wstr, NULL, 0); + return true; + } + return false; +} + +static HRESULT ParseDataToFloat(PCWSTR str, float &value) { + std::wstring wString(str); + wString.erase(std::remove(wString.begin(), wString.end(), L' '), + wString.end()); + wString.erase(std::remove(wString.begin(), wString.end(), L'\n'), + wString.end()); + PCWSTR wstr = wString.data(); + if (_wcsicmp(wstr, L"NaN") == 0) { + value = NAN; + } else if (_wcsicmp(wstr, L"-inf") == 0) { + value = -(INFINITY); + } else if (_wcsicmp(wstr, L"inf") == 0) { + value = INFINITY; + } else if (_wcsicmp(wstr, L"-denorm") == 0) { + value = -(FLT_MIN / 2); + } else if (_wcsicmp(wstr, L"denorm") == 0) { + value = FLT_MIN / 2; + } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 || + _wcsicmp(wstr, L"-0") == 0) { + value = -0.0f; + } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 || + _wcsicmp(wstr, L"0") == 0) { + value = 0.0f; + } else if (_wcsnicmp(wstr, L"0x", 2) == + 0) { // For hex values, take values literally + unsigned temp_i = std::stoul(wstr, nullptr, 16); + value = (float &)temp_i; + } else { + // otherwise, parse the string as a decimal value + double val = _wtof(wstr); + if (val == 0) { + hlsl_test::LogErrorFmt(L"Failed to parse parameter %s to float", wstr); + return E_FAIL; + } + value = (float)val; + } + return S_OK; +} + +static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) { + std::wstring wString(str); + wString.erase(std::remove(wString.begin(), wString.end(), L' '), + wString.end()); + PCWSTR wstr = wString.data(); + // treat explicit zero specially, since wcstoul also returns 0 on failure + if (_wcsicmp(wstr, L"0") == 0 || _wcsicmp(wstr, L"0x00000000") == 0) { + value = 0; + return S_OK; + } + wchar_t *end; + unsigned int val = std::wcstoul(wstr, &end, 0); + if (val == 0) { + hlsl_test::LogErrorFmt(L"Failed to parse parameter %s to int", wstr); + return E_FAIL; + } + value = val; + return S_OK; +} + +static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) { + std::wstring wstr(str); + size_t curPosition = 0; + // parse a comma-separated list of values + for (size_t i = 0; i < count; ++i) { + size_t nextPosition = wstr.find(L",", curPosition); + if (FAILED(ParseDataToFloat( + wstr.substr(curPosition, nextPosition - curPosition).data(), + *(ptr + i)))) { + return E_FAIL; + } + curPosition = nextPosition + 1; + } + return S_OK; +} + +static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
std::wstring wstr(str); + size_t curPosition = 0; + // parse a comma-separated list of values + for (size_t i = 0; i < count; ++i) { + size_t nextPosition = wstr.find(L",", curPosition); + float floatValue; + if (FAILED(ParseDataToFloat( + wstr.substr(curPosition, nextPosition - curPosition).data(), + floatValue))) { + return E_FAIL; + } + *(ptr + i) = ConvertFloat32ToFloat16(floatValue); + curPosition = nextPosition + 1; + } + return S_OK; +} + +static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, + size_t count) { + std::wstring wstr(str); + size_t curPosition = 0; + // parse a comma-separated list of values + for (size_t i = 0; i < count; ++i) { + size_t nextPosition = wstr.find(L",", curPosition); + if (FAILED(ParseDataToUint( + wstr.substr(curPosition, nextPosition - curPosition).data(), + *(ptr + i)))) { + return E_FAIL; + } + curPosition = nextPosition + 1; + } + return S_OK; +} + +#endif // TABLE_PARAMETER_HANDLER_H diff --git a/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp b/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp index 2c75d45e5e..2d9ee7315d 100644 --- a/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp +++ b/tools/clang/unittests/HLSLTestLib/FileCheckerTest.cpp @@ -519,28 +519,21 @@ FileRunCommandPart::RunDxc(dxc::DxcDllSupport &DllSupport, // Convert stage to minimum dxil/validator version: RequiredDxilMajor = std::max(RequiredDxilMajor, (unsigned)6) - 5; - bool bInternalValidator = - opts.SelectValidator == hlsl::options::ValidatorSelection::Internal; bool bValVerExplicit = opts.ValVerMajor != UINT_MAX; - // Normally we must check the validator version as well, but there are - // two scenarios where the validator version doesn't need to be checked - // against the version based on the shader model: - // 1. The test selects internal validator. - // 2. The test explicitly requests a specific validator version. - FileRunCommandResult result = - CheckDxilVer(DllSupport, RequiredDxilMajor, RequiredDxilMinor, - !(bInternalValidator || bValVerExplicit)); + // If validator version set explicitly, skip validator version check when + // checking required version for shader model. + FileRunCommandResult result = CheckDxilVer( + DllSupport, RequiredDxilMajor, RequiredDxilMinor, !bValVerExplicit); if (result.AbortPipeline) return result; // Additionally, if the test explicitly requests a specific non-zero - // validator version, and doesn't select internal validator or disable - // validation, we must check that the validator version is at least as - // high as the requested version. - // When ValVerMajor is 0, validation cannot be run against the module. - if (bValVerExplicit && opts.ValVerMajor != 0 && - !(bInternalValidator || opts.DisableValidation)) + // validator version, and doesn't disable validation, we must check + // that the validator version is at least as high as the requested + // version. When ValVerMajor is 0, validation cannot be run against + // the module.
+ if (bValVerExplicit && opts.ValVerMajor != 0 && !opts.DisableValidation) result = CheckDxilVer(DllSupport, opts.ValVerMajor, opts.ValVerMinor); if (result.AbortPipeline) return result; diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp index ffba7b1633..a15307023e 100644 --- a/unittests/ADT/APIntTest.cpp +++ b/unittests/ADT/APIntTest.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/SmallString.h" #include "gtest/gtest.h" #include +#include <limits> #include using namespace llvm; @@ -753,7 +754,7 @@ TEST(APIntTest, StringDeath) { #endif TEST(APIntTest, mul_clear) { - APInt ValA(65, -1ULL); + APInt ValA(65, std::numeric_limits<uint64_t>::max()); APInt ValB(65, 4); APInt ValC(65, 0); ValC = ValA * ValB; diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp index 26f103b3c1..c7de9194c4 100644 --- a/unittests/ADT/BitVectorTest.cpp +++ b/unittests/ADT/BitVectorTest.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallBitVector.h" #include "gtest/gtest.h" +#include <limits> using namespace llvm; @@ -73,7 +74,8 @@ TYPED_TEST(BitVectorTest, TrivialOperation) { Vec.resize(33, true); Vec.resize(57, false); unsigned Count = 0; - for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) { + for (unsigned i = Vec.find_first(); i != std::numeric_limits<unsigned>::max(); + i = Vec.find_next(i)) { ++Count; EXPECT_TRUE(Vec[i]); EXPECT_TRUE(Vec.test(i)); @@ -103,7 +105,8 @@ TYPED_TEST(BitVectorTest, TrivialOperation) { Vec.resize(91, true); Vec.resize(130, false); Count = 0; - for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) { + for (unsigned i = Vec.find_first(); i != std::numeric_limits<unsigned>::max(); + i = Vec.find_next(i)) { ++Count; EXPECT_TRUE(Vec[i]); EXPECT_TRUE(Vec.test(i)); diff --git a/unittests/Support/DataExtractorTest.cpp b/unittests/Support/DataExtractorTest.cpp index 81de983d22..250b89d696 100644 --- a/unittests/Support/DataExtractorTest.cpp +++ b/unittests/Support/DataExtractorTest.cpp @@ -7,8 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "gtest/gtest.h" #include "llvm/Support/DataExtractor.h" +#include "gtest/gtest.h" +#include <limits> using namespace llvm; namespace { @@ -20,7 +21,8 @@ const char bigleb128data[] = "\xAA\xA9\xFF\xAA\xFF\xAA\xFF\x4A"; TEST(DataExtractorTest, OffsetOverflow) { DataExtractor DE(StringRef(numberData, sizeof(numberData)-1), false, 8); - EXPECT_FALSE(DE.isValidOffsetForDataOfSize(-2U, 5)); + EXPECT_FALSE(DE.isValidOffsetForDataOfSize( + std::numeric_limits<unsigned>::max() - 1, 5)); } TEST(DataExtractorTest, UnsignedNumbers) { diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp index c5ef9d0e99..d356971f24 100644 --- a/utils/TableGen/FixedLenDecoderEmitter.cpp +++ b/utils/TableGen/FixedLenDecoderEmitter.cpp @@ -547,10 +547,11 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. - FilterChooserMap.insert( - std::make_pair(-1U, llvm::make_unique<FilterChooser>( - Owner->AllInstructions, VariableInstructions, - Owner->Operands, BitValueArray, *Owner))); + FilterChooserMap.insert(std::make_pair( + std::numeric_limits<unsigned>::max(), + llvm::make_unique<FilterChooser>(Owner->AllInstructions, + VariableInstructions, Owner->Operands, + BitValueArray, *Owner))); } // No need to recurse for a singleton filtered instruction.
diff --git a/utils/asan/x86_64-pc-linux-gnu.lsan.supp b/utils/asan/x86_64-pc-linux-gnu.lsan.supp new file mode 100644 index 0000000000..3a7725f535 --- /dev/null +++ b/utils/asan/x86_64-pc-linux-gnu.lsan.supp @@ -0,0 +1 @@ +leak:^call_init$ \ No newline at end of file diff --git a/utils/git/requirements_formatting.txt b/utils/git/requirements_formatting.txt index 6f3e07dcf2..2afb003c4f 100644 --- a/utils/git/requirements_formatting.txt +++ b/utils/git/requirements_formatting.txt @@ -42,11 +42,11 @@ pyjwt[crypto]==2.8.0 # via pygithub pynacl==1.5.0 # via pygithub -requests==2.32.0 +requests==2.32.4 # via pygithub toml==0.10.2 # via darker -urllib3==2.2.2 +urllib3==2.5.0 # via requests wrapt==1.15.0 # via deprecated diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt index f1274fd308..f2c0cc5e2e 100644 --- a/utils/hct/gen_intrin_main.txt +++ b/utils/hct/gen_intrin_main.txt @@ -1,9 +1,6 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details. // -// Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -// All rights reserved. -// // See hctdb.py for the implementation of intrinsic file processing. // // Intrinsic declarations are grouped into namespaces that @@ -339,9 +336,9 @@ float<4,3> [[rn]] ObjectToWorld4x3(); float<4,3> [[rn]] WorldToObject4x3(); // Packed dot products with accumulate: -$type3 [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c); -$type3 [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c); -$type3 [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c); +uint [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c); +int [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c); +float [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c); // Unpacking intrinsics int16_t<4> [[rn]] unpack_s8s16(in p32i8 pk); @@ -383,6 +380,14 @@ void [[]] Barrier(in NodeRecordOrUAV o, in uint SemanticFlags); uint [[]] GetRemainingRecursionLevels(); +void [[min_sm=6.9]] __builtin_MatVecMul(out LinAlg OutputVector, in bool OutputIsUnsigned, in LinAlg InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride); + +void [[min_sm=6.9]] __builtin_MatVecMulAdd(out LinAlg OutputVector, in bool OutputIsUnsigned, in LinAlg InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride, in ByteAddressBuffer BiasVector, in uint BiasOffset, in uint BiasInterpretation); + +void [[min_sm=6.9]] __builtin_OuterProductAccumulate(in LinAlg InputVector1, in LinAlg InputVector2, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint MatrixLayout, in uint MatrixStride); + +void [[min_sm=6.9]] __builtin_VectorAccumulate(in LinAlg InputVector, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset); + } namespace @@ -1126,7 +1131,7 @@ namespace DxHitObjectMethods { uint [[rn,class_prefix,min_sm=6.9]] GetPrimitiveIndex(); uint [[rn,class_prefix,min_sm=6.9]] GetHitKind(); uint [[rn,class_prefix,min_sm=6.9]] GetShaderTableIndex(); - $funcT [[class_prefix,min_sm=6.9]] GetAttributes(); + void [[class_prefix,min_sm=6.9]] GetAttributes(out 
udt Attributes); void [[class_prefix,min_sm=6.9]] SetShaderTableIndex(in uint RecordIndex); uint [[ro,class_prefix,min_sm=6.9]] LoadLocalRootTableConstant(in uint RootConstantOffsetInBytes); } namespace diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py index 6344fb5849..2b94b13134 100644 --- a/utils/hct/hctdb.py +++ b/utils/hct/hctdb.py @@ -1,7 +1,5 @@ # Copyright (C) Microsoft Corporation. All rights reserved. # This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details. -# Modifications Copyright(C) 2025 Advanced Micro Devices, Inc. -# All rights reserved. ############################################################################### # DXIL information. # ############################################################################### @@ -873,6 +871,11 @@ def populate_categories_and_models(self): "library", "raygeneration", ) + for i in ( + "MatVecMul,MatVecMulAdd,OuterProductAccumulate,VectorAccumulate" + ).split(","): + self.name_idx[i].category = "Linear Algebra Operations" + self.name_idx[i].shader_model = 6, 9 def populate_llvm_instructions(self): # Add instructions that map to LLVM instructions. @@ -2624,7 +2627,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf<", + "hf", "rn", [ db_dxil_param( @@ -2642,7 +2645,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf<", + "hf", "rn", [ db_dxil_param( @@ -2660,7 +2663,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf<", + "hf", "rn", [ db_dxil_param( @@ -2678,7 +2681,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf<", + "hf", "rn", [ db_dxil_param( @@ -6340,6 +6343,103 @@ def UFI(name, **mappings): ) next_op_idx += 1 + self.add_dxil_op( + "MatVecMul", + next_op_idx, + "MatVecMul", + "Multiplies a MxK dimension matrix and a K sized input vector", + "