[SM6.9] Enable Native Vector Overloads for Derivatives (#7598)

pow2clk · Greg Roth · llvm-beanz · web-flow · commit 86f5bb5045fe · 2025-08-06T11:52:30.000-06:00
This enables derivative operations on native vectors by allowing native vectors in the temporary convergent functions. HLSL intrinsics that lower to DXIL derivative ops have their parameters marked as convergent by passing them through a temporary convergent function. Until now, that function scalarized vectors, which left the results scalarized as well. This change adds native vector overloads of the convergent function and generates them in the convergent pass, which preserves the native vectors through to the final DXIL. It also moves the tests for the previously scalarized intrinsics to the native vector test locations. fwidth requires a more involved expansion, while the derivative operations can be tested trivially. Fixes #7343 --------- Co-authored-by: Greg Roth <grroth@microsoft.com> Co-authored-by: Chris B <beanz@abolishcrlf.org> Co-authored-by: Ashley Coleman <ascoleman@microsoft.com>
diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
@@ -765,32 +765,32 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivCoarseY,
      "DerivCoarseY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivFineX,
      "DerivFineX",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivFineY,
      "DerivFineY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
 
     // Pixel shader
     {OC::EvalSnapped,
diff --git a/lib/HLSL/DxilConvergent.cpp b/lib/HLSL/DxilConvergent.cpp
@@ -38,16 +38,18 @@ class DxilConvergentMark : public ModulePass {
 public:
   static char ID; // Pass identification, replacement for typeid
   explicit DxilConvergentMark() : ModulePass(ID) {}
+  bool SupportsVectors = false;
 
   StringRef getPassName() const override { return "DxilConvergentMark"; }
 
   bool runOnModule(Module &M) override {
-    if (M.HasHLModule()) {
-      const ShaderModel *SM = M.GetHLModule().GetShaderModel();
-      if (!SM->IsPS() && !SM->IsLib() &&
-          (!SM->IsSM66Plus() || (!SM->IsCS() && !SM->IsMS() && !SM->IsAS())))
-        return false;
-    }
+    const ShaderModel *SM = M.GetOrCreateHLModule().GetShaderModel();
+    // Can skip if in a shader and version that doesn't support derivatives.
+    if (!SM->IsPS() && !SM->IsLib() &&
+        (!SM->IsSM66Plus() || (!SM->IsCS() && !SM->IsMS() && !SM->IsAS())))
+      return false;
+    SupportsVectors = SM->IsSM69Plus();
+
     bool bUpdated = false;
 
     for (Function &F : M.functions()) {
@@ -87,7 +89,14 @@ char DxilConvergentMark::ID = 0;
 
 void DxilConvergentMark::MarkConvergent(Value *V, IRBuilder<> &Builder,
                                         Module &M) {
-  Type *Ty = V->getType()->getScalarType();
+  Type *Ty = V->getType();
+  bool NeedVectorExpansion = false;
+  VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (VTy && (!SupportsVectors || VTy->getNumElements() == 1)) {
+    Ty = Ty->getScalarType();
+    NeedVectorExpansion = true;
+  }
+
   // Only work on vector/scalar types.
   if (Ty->isAggregateType() || Ty->isPointerTy())
     return;
@@ -98,7 +107,8 @@ void DxilConvergentMark::MarkConvergent(Value *V, IRBuilder<> &Builder,
   os.flush();
   Function *ConvF = cast<Function>(M.getOrInsertFunction(str, FT));
   ConvF->addFnAttr(Attribute::AttrKind::Convergent);
-  if (VectorType *VT = dyn_cast<VectorType>(V->getType())) {
+  if (NeedVectorExpansion) {
+    VectorType *VT = cast<VectorType>(V->getType());
     Value *ConvV = UndefValue::get(V->getType());
     std::vector<ExtractElementInst *> extractList(VT->getNumElements());
     for (unsigned i = 0; i < VT->getNumElements(); i++) {
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -6981,17 +6981,15 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP_countbits, TrivialUnaryOperationRet,
      DXIL::OpCode::Countbits},
     {IntrinsicOp::IOP_cross, TranslateCross, DXIL::OpCode::NumOpCodes},
-    {IntrinsicOp::IOP_ddx, TrivialUnaryOperationRet,
+    {IntrinsicOp::IOP_ddx, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseX},
+    {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperation,
      DXIL::OpCode::DerivCoarseX},
-    {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperationRet,
-     DXIL::OpCode::DerivCoarseX},
-    {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperationRet,
+    {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperation,
      DXIL::OpCode::DerivFineX},
-    {IntrinsicOp::IOP_ddy, TrivialUnaryOperationRet,
-     DXIL::OpCode::DerivCoarseY},
-    {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperationRet,
+    {IntrinsicOp::IOP_ddy, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseY},
+    {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperation,
      DXIL::OpCode::DerivCoarseY},
-    {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperationRet,
+    {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperation,
      DXIL::OpCode::DerivFineY},
     {IntrinsicOp::IOP_degrees, TranslateDegrees, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_determinant, EmptyLower, DXIL::OpCode::NumOpCodes},
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
@@ -334,6 +334,22 @@ void main() {
   // CHECK: fdiv fast <[[NUM]] x float> <float 1.000000e+00, {{.*}}>, [[fvec1]]
   fRes += rcp(fVec1);
 
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 83, <[[NUM]] x half> [[hvec1]])  ; DerivCoarseX(value)
+  // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 6, <[[NUM]] x half> [[tmp]])  ; FAbs(value)
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 84, <[[NUM]] x half> [[hvec1]])  ; DerivCoarseY(value)
+  // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 6, <[[NUM]] x half> [[tmp]])  ; FAbs(value)
+  hRes += fwidth(hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 83, <[[NUM]] x float> [[fvec1]])  ; DerivCoarseX(value)
+  // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 6, <[[NUM]] x float> [[tmp]])  ; FAbs(value)
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 84, <[[NUM]] x float> [[fvec1]])  ; DerivCoarseY(value)
+  // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 6, <[[NUM]] x float> [[tmp]])  ; FAbs(value)
+  fRes += fwidth(fVec1);
+
   vector<uint, NUM> signs = 1;
   // CHECK-NOT: extractelement
   // CHECK-NOT: insertelement
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
@@ -9,13 +9,6 @@
 // RUN: %dxc -DFUNC=countbits   -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=firstbitlow  -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddx         -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddx_coarse  -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddx_fine    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddy         -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddy_coarse  -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=ddy_fine    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
-// RUN: %dxc -DFUNC=fwidth      -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
 // RUN: %dxc -DFUNC=QuadReadLaneAt         -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
 // RUN: %dxc -DFUNC=QuadReadAcrossX        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
 // RUN: %dxc -DFUNC=QuadReadAcrossY        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
@@ -41,6 +41,20 @@
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=7    %s | FileCheck %s
 // RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=1022 %s | FileCheck %s
 
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx -DOP=83 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx -DOP=83 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx_coarse -DOP=83 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx_coarse -DOP=83 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx_fine -DOP=85 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddx_fine -DOP=85 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy -DOP=84 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy -DOP=84 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy_coarse -DOP=84 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy_coarse -DOP=84 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy_fine -DOP=86 -DNUM=7    %s | FileCheck %s -check-prefixes=CHECK,CONV
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ddy_fine -DOP=86 -DNUM=1022 %s | FileCheck %s -check-prefixes=CHECK,CONV
+
 // Test vector-enabled unary intrinsics that take float-like parameters and
 // and are "trivial" in that they can be implemented with a single call
 // instruction with the same parameter and return types.
@@ -64,6 +78,9 @@ void main() {
   // CHECK: [[hvec:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
   vector<float16_t, NUM> hVec = buf.Load<vector<float16_t, NUM> >(0);
 
+  // Convergent markers prevent GVN removal of redundant annotateHandle calls.
+  // CONV: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
   // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 1024
   // CHECK: [[fvec:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
   vector<float, NUM> fVec = buf.Load<vector<float, NUM> >(1024);
diff --git a/tools/clang/test/CodeGenDXIL/passes/convergent-derivs.hlsl b/tools/clang/test/CodeGenDXIL/passes/convergent-derivs.hlsl
@@ -0,0 +1,33 @@
+// RUN: %dxc -T ps_6_1 -DFUNC=ddx %s        | FileCheck %s --check-prefixes CHECK,PRE69
+// RUN: %dxc -T ps_6_1 -DFUNC=ddx_coarse %s | FileCheck %s --check-prefixes CHECK,PRE69
+// RUN: %dxc -T ps_6_1 -DFUNC=ddx_fine %s   | FileCheck %s --check-prefixes CHECK,PRE69
+// RUN: %dxc -T ps_6_1 -DFUNC=ddy %s        | FileCheck %s --check-prefixes CHECK,PRE69
+// RUN: %dxc -T ps_6_1 -DFUNC=ddy_coarse %s | FileCheck %s --check-prefixes CHECK,PRE69
+// RUN: %dxc -T ps_6_1 -DFUNC=ddy_fine %s   | FileCheck %s --check-prefixes CHECK,PRE69
+
+// RUN: %dxc -T ps_6_9 -DFUNC=ddx %s        | FileCheck %s --check-prefixes CHECK,SM69
+// RUN: %dxc -T ps_6_9 -DFUNC=ddx_coarse %s | FileCheck %s --check-prefixes CHECK,SM69
+// RUN: %dxc -T ps_6_9 -DFUNC=ddx_fine %s   | FileCheck %s --check-prefixes CHECK,SM69
+// RUN: %dxc -T ps_6_9 -DFUNC=ddy %s        | FileCheck %s --check-prefixes CHECK,SM69
+// RUN: %dxc -T ps_6_9 -DFUNC=ddy_coarse %s | FileCheck %s --check-prefixes CHECK,SM69
+// RUN: %dxc -T ps_6_9 -DFUNC=ddy_fine %s   | FileCheck %s --check-prefixes CHECK,SM69
+
+// Make sure add(s) are not sunk into the conditional block.
+// SM69: fadd fast <2 x float>
+// PRE69: fadd fast float
+// PRE69: fadd fast float
+// CHECK: icmp sgt
+// CHECK-NEXT: br i1
+
+// Source for test of dxil-convergent pass.
+
+float2 main(float2 a:A, float2 b:B, int c:C) : SV_Target {
+
+  float2 coord = a + b;
+  float2 res = 0;
+  if (c > 2)
+    res -= FUNC(coord);
+
+  return res;
+
+}
diff --git a/tools/clang/test/CodeGenDXIL/passes/dxil-convergence-sm69.ll b/tools/clang/test/CodeGenDXIL/passes/dxil-convergence-sm69.ll
@@ -0,0 +1,67 @@
+; RUN: %dxopt %s -hlsl-passes-resume -hlsl-dxil-convergent-mark -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+; Function Attrs: nounwind readnone
+declare <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32, <2 x float>) #1
+
+; Function Attrs: nounwind
+define void @main(<2 x float>* noalias %arg, <2 x float> %arg1, <2 x float> %arg2, i32 %arg3) #0 {
+bb:
+
+  %tmp = fadd <2 x float> %arg1, %arg2
+  ; CHECK: [[vec:%.*]] = call <2 x float> @"dxil.convergent.marker.<2 x float>"(<2 x float> %tmp)
+  %tmp4 = icmp sgt i32 %arg3, 2
+  %tmp5 = icmp ne i1 %tmp4, false
+  %tmp6 = icmp ne i1 %tmp5, false
+  br i1 %tmp6, label %bb7, label %bb10
+
+bb7:                                              ; preds = %bb
+  ; CHECK: call <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32 128, <2 x float> [[vec]])
+  %tmp8 = call <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32 128, <2 x float> %tmp)
+  %tmp9 = fsub <2 x float> zeroinitializer, %tmp8
+  br label %bb10
+
+bb10:                                             ; preds = %bb7, %bb
+  %res.0 = phi <2 x float> [ %tmp9, %bb7 ], [ zeroinitializer, %bb ]
+  store <2 x float> %res.0, <2 x float>* %arg
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!3}
+!dx.shaderModel = !{!4}
+!dx.typeAnnotations = !{!5}
+!dx.entryPoints = !{!18}
+!dx.fnprops = !{!19}
+!dx.options = !{!20, !21}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.4959 (coopvec-tests, 43e1db83c-dirty)"}
+!3 = !{i32 1, i32 9}
+!4 = !{!"ps", i32 6, i32 9}
+!5 = !{i32 1, void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, !6}
+!6 = !{!7, !9, !12, !14, !16}
+!7 = !{i32 0, !8, !8}
+!8 = !{}
+!9 = !{i32 1, !10, !11}
+!10 = !{i32 4, !"SV_Target", i32 7, i32 9}
+!11 = !{i32 0}
+!12 = !{i32 0, !13, !11}
+!13 = !{i32 4, !"A", i32 7, i32 9}
+!14 = !{i32 0, !15, !11}
+!15 = !{i32 4, !"B", i32 7, i32 9}
+!16 = !{i32 0, !17, !11}
+!17 = !{i32 4, !"C", i32 7, i32 4}
+!18 = !{void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, !"main", null, null, null}
+!19 = !{void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, i32 0, i1 false}
+!20 = !{i32 64}
+!21 = !{i32 -1}
diff --git a/tools/clang/test/CodeGenDXIL/passes/dxil-convergence.ll b/tools/clang/test/CodeGenDXIL/passes/dxil-convergence.ll
@@ -0,0 +1,82 @@
+; RUN: %dxopt %s -hlsl-passes-resume -hlsl-dxil-convergent-mark -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+; Function Attrs: nounwind readnone
+declare <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32, <2 x float>) #1
+
+; Function Attrs: nounwind
+define void @main(<2 x float>* noalias %arg, <2 x float> %arg1, <2 x float> %arg2, i32 %arg3) #0 {
+bb:
+  ; CHECK: [[val:%.*]] = extractelement <2 x float> %tmp, i64 0
+  ; CHECK: [[conv:%.*]] = call float @dxil.convergent.marker.float(float [[val]])
+  ; CHECK: [[vec0:%.*]] = insertelement <2 x float> undef, float [[conv]], i64 0
+  ; CHECK: [[val:%.*]] = extractelement <2 x float> %tmp, i64 1
+  ; CHECK: [[conv:%.*]] = call float @dxil.convergent.marker.float(float [[val]])
+  ; CHECK: [[vec:%.*]] = insertelement <2 x float> [[vec0]], float [[conv]], i64 1
+  %tmp = fadd <2 x float> %arg1, %arg2
+  %tmp4 = icmp sgt i32 %arg3, 2
+  %tmp5 = icmp ne i1 %tmp4, false
+  %tmp6 = icmp ne i1 %tmp5, false
+  br i1 %tmp6, label %bb7, label %bb10
+
+bb7:                                              ; preds = %bb
+  ; CHECK: call <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32 128, <2 x float> [[vec]])
+  %tmp8 = call <2 x float> @"dx.hl.op.rn.<2 x float> (i32, <2 x float>)"(i32 128, <2 x float> %tmp)
+  %tmp9 = fsub <2 x float> zeroinitializer, %tmp8
+  br label %bb10
+
+bb10:                                             ; preds = %bb7, %bb
+  %res.0 = phi <2 x float> [ %tmp9, %bb7 ], [ zeroinitializer, %bb ]
+  store <2 x float> %res.0, <2 x float>* %arg
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!pauseresume = !{!1}
+!llvm.ident = !{!2}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!19}
+!dx.fnprops = !{!20}
+!dx.options = !{!21, !22}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!2 = !{!"dxc(private) 1.8.0.4959 (coopvec-tests, 43e1db83c-dirty)"}
+!3 = !{i32 1, i32 8}
+!4 = !{i32 1, i32 9}
+!5 = !{!"ps", i32 6, i32 8}
+!6 = !{i32 1, void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, !7}
+!7 = !{!8, !10, !13, !15, !17}
+!8 = !{i32 0, !9, !9}
+!9 = !{}
+!10 = !{i32 1, !11, !12}
+!11 = !{i32 4, !"SV_Target", i32 7, i32 9}
+!12 = !{i32 0}
+!13 = !{i32 0, !14, !12}
+!14 = !{i32 4, !"A", i32 7, i32 9}
+!15 = !{i32 0, !16, !12}
+!16 = !{i32 4, !"B", i32 7, i32 9}
+!17 = !{i32 0, !18, !12}
+!18 = !{i32 4, !"C", i32 7, i32 4}
+!19 = !{void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, !"main", null, null, null}
+!20 = !{void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main, i32 0, i1 false}
+!21 = !{i32 64}
+!22 = !{i32 -1}
+!23 = !DILocation(line: 26, column: 20, scope: !24)
+!24 = !DISubprogram(name: "main", scope: !25, file: !25, line: 24, type: !26, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: false, function: void (<2 x float>*, <2 x float>, <2 x float>, i32)* @main)
+!25 = !DIFile(filename: "/Users/pow2clk/dxc/tools/clang/test/CodeGenDXIL/passes/convergent-derivs.hlsl", directory: "")
+!26 = !DISubroutineType(types: !9)
+!27 = !DILocation(line: 28, column: 9, scope: !24)
+!28 = !DILocation(line: 28, column: 7, scope: !24)
+!29 = !DILocation(line: 29, column: 12, scope: !24)
+!30 = !DILocation(line: 29, column: 9, scope: !24)
+!31 = !DILocation(line: 29, column: 5, scope: !24)
+!32 = !DILocation(line: 31, column: 3, scope: !24)
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl
@@ -60,8 +60,6 @@ void main() {
   // CHECK: fmul fast <13 x float> [[mul]], [[sub]]
   fRes += smoothstep(fVec1, fVec2, fVec3);
 
-  // Intrinsics that expand into llvm ops.
-
   // CHECK: fmul fast <13 x float> [[fvec3]], <float 0x3F91DF46A0000000
   fRes += radians(fVec3);
 
@@ -82,6 +80,11 @@ void main() {
   // CHECK: fmul fast <13 x half> [[tmp]], [[hvec1]]
   hRes += lerp(hVec2, hVec3, hVec1);
 
+  // CHECK: [[tmp:%.*]] = call <13 x float> @dx.op.unary.v13f32(i32 83, <13 x float> [[fvec1]])  ; DerivCoarseX(value)
+  // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 6, <13 x float> [[tmp]])  ; FAbs(value)
+  // CHECK: [[tmp:%.*]] = call <13 x float> @dx.op.unary.v13f32(i32 84, <13 x float> [[fvec1]])  ; DerivCoarseY(value)
+  // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 6, <13 x float> [[tmp]])  ; FAbs(value)
+  fRes += fwidth(fVec1);
 
   // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) 
   // CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
@@ -165,6 +168,9 @@ void main() {
   // CHECK: call <13 x float> @dx.op.tertiary.v13f32(i32 46, <13 x float> [[fvec1]], <13 x float> [[fvec2]], <13 x float> [[fvec3]])  ; FMad(a,b,c)
   fRes += mad(fVec1, fVec2, fVec3);
 
+  // CHECK: call <13 x half> @dx.op.unary.v13f16(i32 85, <13 x half> [[hvec1]])  ; DerivFineX(value)
+  hRes += ddx_fine(hVec1);
+
   // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) 
   // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0
   // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) 
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py