Improve tf32 dpas Prototype for A/B Matrix Sources

YuriPlyakhin · igcbot · commit aa0886f55afa · 2024-03-11T17:02:26.000+01:00
Update the TF32 DPAS prototypes to use float types for the A and B matrices.
The previous short-based A matrix type did not allow pre-processing of the
A matrix and did not work well with 2D block loads for u32 sources.
diff --git a/IGC/BiFModule/Implementation/IGCBiF_Intrinsics_Dpas.cl b/IGC/BiFModule/Implementation/IGCBiF_Intrinsics_Dpas.cl
@@ -339,16 +339,16 @@ half8   __builtin_IB_sub_group16_fdpas_hf_hf_hf_hf_8_8 (half8  acc,  short8  a,
 
 
 // tf32, rcount = 1, simd16
-float   __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_1  (float  acc, short  a, int8 b) __attribute__((const));
+float   __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_1  (float  acc, float  a, float8 b) __attribute__((const));
 
 // tf32, rcount = 2, simd16
-float2  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_2  (float2 acc, short2 a, int8 b) __attribute__((const));
+float2  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_2  (float2 acc, float  a, float8 b) __attribute__((const));
 
 // tf32, rcount = 4, simd16
-float4  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_4  (float4 acc, short4 a, int8 b) __attribute__((const));
+float4  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_4  (float4 acc, float2 a, float8 b) __attribute__((const));
 
 // tf32, rcount = 8, simd16
-float8  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_8  (float8 acc, short8 a, int8 b) __attribute__((const));
+float8  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_8  (float8 acc, float4 a, float8 b) __attribute__((const));
 
 
 //
diff --git a/IGC/BiFModule/Languages/OpenCL/IBiF_dpas.cl b/IGC/BiFModule/Languages/OpenCL/IBiF_dpas.cl
@@ -483,10 +483,10 @@ DEFN_INTEL_CVT2( f32_to_bf16_packed,  int16, float16, float16, 2fto2bf_16 )
 #ifdef cl_intel_subgroup_matrix_multiply_accumulate_tf32
 // PVC_B
 
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float,   float,   short,   int8,  fdpas_f_f_tf32_tf32_8_1 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float2,  float2,  short2,  int8,  fdpas_f_f_tf32_tf32_8_2 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float4,  float4,  short4,  int8,  fdpas_f_f_tf32_tf32_8_4 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float8,  float8,  short8,  int8,  fdpas_f_f_tf32_tf32_8_8 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float,   float,   float,   float8,  fdpas_f_f_tf32_tf32_8_1 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float2,  float2,  float,   float8,  fdpas_f_f_tf32_tf32_8_2 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float4,  float4,  float2,  float8,  fdpas_f_f_tf32_tf32_8_4 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float8,  float8,  float4,  float8,  fdpas_f_f_tf32_tf32_8_8 )
 
 DEFN_INTEL_CVT( f32_to_tf32,  int,   float,   ftotf32_1  )
 DEFN_INTEL_CVT( f32_to_tf32,  int2,  float2,  ftotf32_2  )
diff --git a/IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h b/IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h
@@ -2380,11 +2380,22 @@ int16 __attribute__((overloadable)) intel_convert_f32_to_bf16_packed(float16 a,
 
 #ifdef cl_intel_subgroup_matrix_multiply_accumulate_tf32
 
-// A: half of tfloat32 B: tfloat32 ACC: float DST: float
-float  __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(short  a, int8 b, float  acc);
-float2 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(short2 a, int8 b, float2 acc);
-float4 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(short4 a, int8 b, float4 acc);
-float8 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(short8 a, int8 b, float8 acc);
+// A: tf32, even rows in lower 8 SIMD channels, odd rows in upper 8 SIMD channels
+// B: tf32
+// ACC: float
+// DST: float
+
+// M = 1, K = 8, N = 16, upper 8 channels of a are ignored
+float  __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float  a, float8 b, float  acc);
+
+// M = 2, K = 8, N = 16, all channels of a are used
+float2 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float  a, float8 b, float2 acc);
+
+// M = 4, K = 8, N = 16
+float4 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float2 a, float8 b, float4 acc);
+
+// M = 8, K = 8, N = 16
+float8 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float4 a, float8 b, float8 acc);
 
 // Conversions
 int   __attribute__((overloadable)) intel_convert_f32_to_tf32(float source);
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp
@@ -421,16 +421,21 @@ void DpasFuncsResolution::visitCallInst(CallInst& CI)
             IGC_ASSERT_MESSAGE(ACC_nelts == RC, "ICE: dpas intrinsic has mismatched vector sizes of arguments!");
             IGC_ASSERT_MESSAGE(B_nelts == SD, "ICE: dpas intrinsic has mismatched vector sizes of arguments!");
             IGC_ASSERT_MESSAGE(precOk, "ICE: dpas's A and B have illegal type combination!");
-            IGC_ASSERT_MESSAGE(B_BaseTy->isIntegerTy(32), "ICE: dpas's arg B shall have base type int32!");
-            IGC_ASSERT_MESSAGE(RC == (IsDpasw ? 2 * A_nelts : A_nelts), "ICE: dpas's arg A has wrong element size!");
+            IGC_ASSERT_MESSAGE(B_BaseTy->isIntegerTy(32) || (PB == PrecisionType::TF32 && B_BaseTy->isFloatTy()),
+                "ICE: dpas's arg B shall have base type int32 or float!");
+            IGC_ASSERT_MESSAGE((RC == (IsDpasw ? 2 * A_nelts : A_nelts) ||
+                               (PA == PrecisionType::TF32 && (RC == 2 * A_nelts))),
+                               "ICE: dpas's arg A has wrong element size!");
 
             uint32_t AbitsPerDepth = 32;
             if (m_pCtx->platform.hasExecSize16DPAS())
             {
                 AbitsPerDepth = AbitsPerDepth / 2;
             }
 
-            IGC_ASSERT_MESSAGE(A_BaseTy->isIntegerTy(AbitsPerDepth), "ICE: dpas intrinsic's A has wrong base type!");
+            IGC_ASSERT_MESSAGE(A_BaseTy->isIntegerTy(AbitsPerDepth) ||
+                              (PA == PrecisionType::TF32 && A_BaseTy->isFloatTy()),
+                              "ICE: dpas intrinsic's A has wrong base type!");
             if (PA == PrecisionType::TF32)
             {
                 if (!(DstTy == DSTACC_FLOAT && AccTy == DSTACC_FLOAT))
@@ -458,7 +463,15 @@ void DpasFuncsResolution::visitCallInst(CallInst& CI)
     Value* args[8];
     args[0] = CI.getArgOperand(0);
     args[1] = CI.getArgOperand(1);
-    args[2] = CI.getArgOperand(2);
+
+    Value* B = CI.getArgOperand(2);
+    Type* BTy = B->getType();
+    if (FixedVectorType *BVecTy = dyn_cast<FixedVectorType>(BTy); BVecTy && BTy->getScalarType()->isFloatTy()) {
+        B = CastInst::Create(Instruction::CastOps::BitCast, B,
+            FixedVectorType::get(intTy, (unsigned) BVecTy->getNumElements()),
+            B->getName() + ".cast", &CI);
+    }
+    args[2] = B;
 
     args[3] = ConstantInt::get(intTy, PA);
     args[4] = ConstantInt::get(intTy, PB);