diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 368e1a646c83b..76c49336aabca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -3363,9 +3363,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
     // Already computed the OpcodeOffset table, just index into it.
     if (N.getOpcode() < OpcodeOffset.size())
       MatcherIndex = OpcodeOffset[N.getOpcode()];
-    if (N->getOpcode() == ISD::FADD) {
-      MatcherIndex = 0;
-    }
+      if (N->getOpcode() == ISD::FADD) {
+        MatcherIndex = 0;
+      }
     LLVM_DEBUG(dbgs() << "  Initial Opcode index to " << MatcherIndex << "\n");
   } else if (MatcherTable[0] == OPC_SwitchOpcode) {
     // Otherwise, the table isn't computed, but the state machine does start
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 75becd6122fb9..ffa9f4f947e54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -793,7 +793,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   }
 
-  if (N->getOpcode() == ISD::FADD && false) {
+  if (N->getOpcode() == ISD::FADD) {
     llvm::dbgs() << "N->dump()\n";
     N->dump();
     MySelectCode(N);
@@ -1717,7 +1717,7 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
   return true;
 }
 
-bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N) const {
+bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const { // Stub: dumps N and always returns false; CurDAG is not used in this body.
   N->dump();
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index ab581586c90a3..ea72d04eb8d7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -277,7 +277,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectWAVE_ADDRESS(SDNode *N);
   void SelectSTACKRESTORE(SDNode *N);
 
-  bool MaxsComplexPatternPackedFP(SDNode *N) const;
+  bool MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const;
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -290,7 +290,7 @@ void MySelectCode(SDNode *N) {
   #define TARGET_VAL(X) X & 255, unsigned(X) >> 8
   #define COVERAGE_IDX_VAL(X) X & 255, (unsigned(X) >> 8) & 255, (unsigned(X) >> 16) & 255, (unsigned(X) >> 24) & 255
   static const unsigned char MatcherTable[] = {
-// /*528829*/ /*SwitchOpcode*/ 118|128,4/*630*/, TARGET_VAL(ISD::FADD),// ->529463
+// /*528829*/ /*SwitchOpcode*/ 120|128,4/*632*/, TARGET_VAL(ISD::FADD),// ->529465
 /*528833*/  OPC_Scope, 76, /*->528911*/ // 4 children in Scope
 /*528835*/   OPC_MoveChild0,
 /*528836*/   OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN),
@@ -393,7 +393,7 @@ void MySelectCode(SDNode *N) {
               // Src: (fadd:{ *:[f32] } (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod)), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod)) - Complexity = 33
               // Dst: (V_MAD_LEGACY_F32_e64:{ *:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] })
 /*529063*/   0, /*End of Scope*/
-/*529064*/  /*Scope*/ 12|128,3/*396*/, /*->529462*/
+/*529064*/  /*Scope*/ 14|128,3/*398*/, /*->529464*/
 /*529066*/   OPC_RecordChild0, // #0 = $VOP3NoMods:src2
 /*529067*/   OPC_Scope, 74, /*->529143*/ // 2 children in Scope
 /*529069*/    OPC_MoveChild1,
@@ -428,7 +428,7 @@ void MySelectCode(SDNode *N) {
                // Src: (fadd:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod), (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod))) - Complexity = 33
                // Dst: (V_MAD_LEGACY_F32_e64:{ *:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] })
 /*529142*/    0, /*End of Scope*/
-/*529143*/   /*Scope*/ 60|128,2/*316*/, /*->529461*/
+/*529143*/   /*Scope*/ 62|128,2/*318*/, /*->529463*/
 /*529145*/    OPC_RecordChild1, // #1 = $src1
 /*529146*/    OPC_Scope, 25, /*->529173*/ // 6 children in Scope
 /*529148*/     OPC_CheckPredicate3,  // Predicate_anonymous_13768
@@ -543,36 +543,37 @@ void MySelectCode(SDNode *N) {
                 // Src: (fadd:{ *:[f64] } (VOP3Mods:{ *:[f64] } f64:{ *:[f64] }:$src1, i32:{ *:[i32] }:$src1_modifiers), (VOP3Mods0:{ *:[f64] } f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src0_modifiers, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod)) - Complexity = -973
                 // Dst: (V_ADD_F64_e64:{ *:[f64] } i32:{ *:[i32] }:$src0_modifiers, f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src1_modifiers, f64:{ *:[f64] }:$src1, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod)
 /*529389*/     0, /*End of Scope*/
-/*529390*/    /*Scope*/ 33, /*->529424*/
-/*529391*/     OPC_CheckType, /*MVT::v2f16*/89,
-/*529393*/     OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
-/*529396*/     OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
-/*529399*/     OPC_EmitInteger, /*MVT::i1*/2, 0,  // 0 #6
-/*529402*/     OPC_EmitInteger32, 0,  // 0 #7
-/*529404*/     OPC_EmitInteger32, 0,  // 0 #8
-/*529406*/     OPC_EmitInteger32, 0,  // 0 #9
-/*529408*/     OPC_EmitInteger32, 0,  // 0 #10
-/*529410*/     OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16),
+/*529390*/    /*Scope*/ 37, /*->529428*/
+/*529391*/     OPC_CheckType, /*MVT::v2f32*/109,
+/*529393*/     OPC_CheckPredicate, 108, // Predicate_my_any_fadd
+/*529395*/     OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops())
+/*529397*/     OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
+/*529400*/     OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
+/*529403*/     OPC_EmitInteger, /*MVT::i1*/2, 0,  // 0 #6
+/*529406*/     OPC_EmitInteger32, 0,  // 0 #7
+/*529408*/     OPC_EmitInteger32, 0,  // 0 #8
+/*529410*/     OPC_EmitInteger32, 0,  // 0 #9
+/*529412*/     OPC_EmitInteger32, 0,  // 0 #10
+/*529414*/     OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32),
+                   /*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
+               // Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers))<<P:Predicate_my_any_fadd>> - Complexity = -978
+               // Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1)
+/*529428*/    /*Scope*/ 33, /*->529462*/
+/*529429*/     OPC_CheckType, /*MVT::v2f16*/89,
+/*529431*/     OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
+/*529434*/     OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
+/*529437*/     OPC_EmitInteger, /*MVT::i1*/2, 0,  // 0 #6
+/*529440*/     OPC_EmitInteger32, 0,  // 0 #7
+/*529442*/     OPC_EmitInteger32, 0,  // 0 #8
+/*529444*/     OPC_EmitInteger32, 0,  // 0 #9
+/*529446*/     OPC_EmitInteger32, 0,  // 0 #10
+/*529448*/     OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16),
                    /*MVT::v2f16*/89, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
                // Src: (fadd:{ *:[v2f16] } (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979
                // Dst: (V_PK_ADD_F16:{ *:[v2f16] } i32:{ *:[i32] }:$src0_modifiers, v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f16:{ *:[v2f16] }:$src1)
-/*529424*/    /*Scope*/ 35, /*->529460*/
-/*529425*/     OPC_CheckType, /*MVT::v2f32*/109,
-/*529427*/     OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops())
-/*529429*/     OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
-/*529432*/     OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
-/*529435*/     OPC_EmitInteger, /*MVT::i1*/2, 0,  // 0 #6
-/*529438*/     OPC_EmitInteger32, 0,  // 0 #7
-/*529440*/     OPC_EmitInteger32, 0,  // 0 #8
-/*529442*/     OPC_EmitInteger32, 0,  // 0 #9
-/*529444*/     OPC_EmitInteger32, 0,  // 0 #10
-/*529446*/     OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32),
-                   /*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
-               // Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979
-               // Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1)
-/*529460*/    0, /*End of Scope*/
-/*529461*/   0, /*End of Scope*/
-/*529462*/  0, /*End of Scope*/
+/*529462*/    0, /*End of Scope*/
+/*529463*/   0, /*End of Scope*/
+/*529464*/  0, /*End of Scope*/
   }; // Total Array size is 563301 bytes
 
   SelectCodeCommon(N, MatcherTable, sizeof(MatcherTable));
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index caf115490dccc..a319de43d23e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1143,8 +1143,9 @@ struct MaxsUnpackPackedF32OpsDAGMutation : ScheduleDAGMutation {
     const TargetInstrInfo &TII = *DAG->TII;
     const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
     for (auto &I : *DAG) {
-      if (I.getOpcode() == AMDGPU::V_PK_ADD_F32)
+      if (I.getOpcode() == AMDGPU::V_PK_ADD_F32) {
         I.dump();
+      }
       I.dump();
     }
     llvm::dbgs() << "Completed MaxsUnpackPackedF32OpsDAGMutation\n";
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5768f27a2a263..88fb90eab8118 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1144,9 +1144,13 @@ def MAIInstInfoTable : GenericTable {
   let PrimaryKeyName = "getMAIInstInfoHelper";
 }
 
+// NOTE: this approach still doesn't work, because
+// `SubtargetPredicate = HasPackedFP32Ops`
+// doesn't just determine selection; it also determines type legalization, etc.
+// (the concerns are entangled).
 def my_any_fadd : PatFrags<(ops node:$lhs, node:$rhs),
                               [(strict_fadd node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs)]> {
-    let PredicateCode = [{ return MaxsComplexPatternPackedFP(N); }];
+    let PredicateCode = [{ return MaxsComplexPatternPackedFP(N, CurDAG); }];
 }
 
 let isCommutable = 1, isReMaterializable = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/test_v_pk.ll b/llvm/lib/Target/AMDGPU/test_v_pk.ll
new file mode 100644
index 0000000000000..19d7ad418f187
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/test_v_pk.ll
@@ -0,0 +1,90 @@
+; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll  -mtriple=amdgcn -mcpu=gfx942 -o -
+; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll  -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o -
+; add_kernel: packs scalar float loads into <2 x float> values and performs one live vector fadd (%v_500).
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 {
+  %5 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %6 = shl i32 %5, 10
+  %7 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %8 = shl i32 %7, 2
+  %9 = and i32 %8, 1020
+  %10 = or disjoint i32 %9, %6 ; base index: ((workitem.id.x << 2) & 1020) | (workgroup.id.x << 10)
+  %11 = icmp slt i32 %10, %3
+  br i1 %11, label %.critedge, label %.critedge2 ; skip the body unless %10 < %3 (signed)
+
+.critedge:                                        ; preds = %4
+  %12 = or disjoint i32 %10, 3
+  %13 = or disjoint i32 %10, 2
+  %14 = or disjoint i32 %10, 1
+  %15 = sext i32 %10 to i64
+  %16 = getelementptr float, ptr addrspace(1) %0, i64 %15
+  %17 = addrspacecast ptr addrspace(1) %16 to ptr
+  %18 = load float, ptr %17, align 16
+  %19 = getelementptr inbounds i8, ptr %17, i64 4
+  %20 = load float, ptr %19, align 4
+  ; %v_102 = <%0[%10], %0[%10 + 1]> (float elements, read via the addrspacecast pointer %17)
+  %v_100 = insertelement <2 x float> undef, float %18, i32 0
+  %v_102 = insertelement <2 x float> %v_100, float %20, i32 1
+  ; Next two floats of %0 (byte offsets 8 and 12 from %17).
+  %21 = getelementptr inbounds i8, ptr %17, i64 8
+  %22 = load float, ptr %21, align 8
+  %23 = getelementptr inbounds i8, ptr %17, i64 12
+  %24 = load float, ptr %23, align 4
+  ; %v_202 = <%0[%10 + 2], %0[%10 + 3]>; its only use is the commented-out %v_501 fadd.
+  %v_200 = insertelement <2 x float> undef, float %22, i32 0
+  %v_202 = insertelement <2 x float> %v_200, float %24, i32 1
+  ; NOTE(review): %28, %30 and %32 below have no uses in this function; the stores at the end recompute the %2 addresses.
+  %25 = getelementptr float, ptr addrspace(1) %1, i64 %15
+  %26 = addrspacecast ptr addrspace(1) %25 to ptr
+  %27 = sext i32 %12 to i64
+  %28 = getelementptr float, ptr addrspace(1) %2, i64 %27
+  %29 = sext i32 %13 to i64
+  %30 = getelementptr float, ptr addrspace(1) %2, i64 %29
+  %31 = sext i32 %14 to i64
+  %32 = getelementptr float, ptr addrspace(1) %2, i64 %31
+  %33 = getelementptr inbounds i8, ptr %26, i64 12
+  %34 = load float, ptr %33, align 4
+
+  %36 = getelementptr inbounds i8, ptr %26, i64 8
+  %37 = load float, ptr %36, align 8
+  ; %v_302 = <%1[%10 + 3], %1[%10 + 2]>; its only use is the commented-out %v_501 fadd.
+  %v_300 = insertelement <2 x float> undef, float %34, i32 0
+  %v_302 = insertelement <2 x float> %v_300, float %37, i32 1
+
+  %39 = getelementptr inbounds i8, ptr %26, i64 4
+  %40 = load float, ptr %39, align 4
+  %42 = load float, ptr %26, align 16
+  ; %v_402 = <%1[%10 + 1], %1[%10]>: lane order is swapped relative to %v_102.
+  %v_400 = insertelement <2 x float> undef, float %40, i32 0
+  %v_402 = insertelement <2 x float> %v_400, float %42, i32 1
+  ; The only live vector operation in the function.
+  %v_500 = fadd <2 x float> %v_102, %v_402
+  ; %v_501 = fadd <2 x float> %v_202, %v_302
+  ; tail call void @llvm.amdgcn.iglp.opt(i32 4)
+
+  ; %v_45 = extractelement <2 x float> %v_501, i32 1
+  ; %v_32 = extractelement <2 x float> %v_501, i32 0
+  %v_30 = extractelement <2 x float> %v_500, i32 1
+  %v_28 = extractelement <2 x float> %v_500, i32 0
+  ; Store lane 0 (%v_28) to %2[%10].
+  %i_44 = sext i32 %10 to i64
+  %p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44
+  store float %v_28, ptr addrspace(1) %p_45, align 4
+
+  ; %i_31 = sext i32 %14 to i64
+  ; %p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31
+  ; store float %v_32, ptr addrspace(1) %p_32, align 4
+  ; Store lane 1 (%v_30) to %2[%13], where %13 = %10 | 2.
+  %i_29 = sext i32 %13 to i64
+  %p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29
+  store float %v_30, ptr addrspace(1) %p_30, align 4
+  ; NOTE(review): lane 0 (%v_28) is stored again here, to %2[%12] with %12 = %10 | 3; confirm this is intended.
+  %i_27 = sext i32 %12 to i64
+  %p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27
+  store float %v_28, ptr addrspace(1) %p_28, align 4
+
+  br label %.critedge2
+; NOTE(review): attribute group #0 and the llvm.amdgcn.* intrinsics are used above but not defined/declared within this 90-line file.
+.critedge2:                                       ; preds = %4, %.critedge
+  ret void
+}