
Commit be47a27

Merge commit '3613bf40d90a38766ec65a250aeadb391f9f7fc9'
2 parents: c0c76b1 + 3613bf4

5 files changed: 168 additions and 34 deletions

lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 34 additions & 10 deletions
@@ -41,36 +41,60 @@ SmallVector<Value> reorderValues(const SmallVector<Value> &values, Type inType,
   if (inBitWidth == ouBitWidth)
     return values;
   if (inBitWidth == 16 && ouBitWidth == 32) {
+    // Register layout conversion:
+    //
+    //   [0, 1], [4, 5]  ⟶  [0], [1], [4], [5]
+    //   [2, 3], [6, 7]      [2], [3], [6], [7]
+    //
+    // Original access order:
+    //
+    //   [0, 1], [2, 3], [4, 5], [6, 7]
+    //
+    // Transformed access order:
+    //
+    //   [0], [2], [1], [3], [4], [6], [5], [7]
     SmallVector<Value> ret;
     for (unsigned i = 0; i < values.size(); i += 8) {
       ret.push_back(values[i]);
-      ret.push_back(values[i + 1]);
-      ret.push_back(values[i + 4]);
-      ret.push_back(values[i + 5]);
       ret.push_back(values[i + 2]);
+      ret.push_back(values[i + 1]);
       ret.push_back(values[i + 3]);
+      ret.push_back(values[i + 4]);
       ret.push_back(values[i + 6]);
+      ret.push_back(values[i + 5]);
       ret.push_back(values[i + 7]);
     }
     return ret;
   }
   if (inBitWidth == 8 && ouBitWidth == 16) {
+    // Register layout conversion:
+    //
+    //   [0, 1, 2, 3], [8, 9, 10, 11]   ⟶  [0, 1], [2, 3], [8, 9], [10, 11]
+    //   [4, 5, 6, 7], [12, 13, 14, 15]     [4, 5], [6, 7], [12, 13], [14, 15]
+    //
+    // Original access order:
+    //
+    //   [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]
+    //
+    // Transformed access order:
+    //
+    //   [0, 1], [4, 5], [2, 3], [6, 7], [8, 9], [12, 13], [10, 11], [14, 15]
     SmallVector<Value> ret;
     for (unsigned i = 0; i < values.size(); i += 16) {
-      ret.push_back(values[i + 0]);
+      ret.push_back(values[i]);
       ret.push_back(values[i + 1]);
-      ret.push_back(values[i + 2]);
-      ret.push_back(values[i + 3]);
-      ret.push_back(values[i + 8]);
-      ret.push_back(values[i + 9]);
-      ret.push_back(values[i + 10]);
-      ret.push_back(values[i + 11]);
       ret.push_back(values[i + 4]);
       ret.push_back(values[i + 5]);
+      ret.push_back(values[i + 2]);
+      ret.push_back(values[i + 3]);
       ret.push_back(values[i + 6]);
       ret.push_back(values[i + 7]);
+      ret.push_back(values[i + 8]);
+      ret.push_back(values[i + 9]);
       ret.push_back(values[i + 12]);
       ret.push_back(values[i + 13]);
+      ret.push_back(values[i + 10]);
+      ret.push_back(values[i + 11]);
       ret.push_back(values[i + 14]);
       ret.push_back(values[i + 15]);
     }
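
For reference, a minimal standalone C++ sketch (a mock with plain ints, not the Triton lowering itself) that reproduces the new 16-bit to 32-bit access order documented in the comment above:

#include <cstdio>
#include <vector>

int main() {
  // Stand-in register ids for one group of eight values.
  std::vector<int> values = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> ret;
  for (size_t i = 0; i < values.size(); i += 8)
    for (int idx : {0, 2, 1, 3, 4, 6, 5, 7})
      ret.push_back(values[i + idx]);
  // Prints "0 2 1 3 4 6 5 7": the transformed access order.
  for (int v : ret)
    printf("%d ", v);
  printf("\n");
}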

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 4 additions & 22 deletions
@@ -641,7 +641,6 @@ struct ConvertLayoutOpConversion
     // for the destination type, we need to pack values together
     // so they can be consumed by tensor core operations
     SmallVector<Value> vecVals;
-    SmallVector<Type> types;
     // For some reasons, LLVM's NVPTX backend inserts unnecessary (?) integer
     // instructions to pack & unpack sub-word integers. A workaround is to
     // store the results of ldmatrix in i32
@@ -655,37 +654,20 @@ struct ConvertLayoutOpConversion
             shl(i32_ty, zext(i32_ty, vals[i + j]), i32_val(elemSize * j));
         val = or_(i32_ty, val, ext);
       }
-      vecVals.push_back(val);
+      vecVals.push_back(bitcast(val, i32_ty));
     }
-    elems = elems / (32 / elemSize);
-    types = SmallVector<Type>(elems, i32_ty);
   } else {
     unsigned vecSize = std::max<unsigned>(32 / elemSize, 1);
     Type vecTy = vec_ty(elemTy, vecSize);
-    types = SmallVector<Type>(elems / vecSize, vecTy);
     for (unsigned i = 0; i < elems; i += vecSize) {
       Value packed = rewriter.create<LLVM::UndefOp>(loc, vecTy);
       for (unsigned j = 0; j < vecSize; j++)
         packed = insert_element(vecTy, packed, vals[i + j], i32_val(j));
-      vecVals.push_back(packed);
+      vecVals.push_back(bitcast(packed, i32_ty));
     }
   }
-
-  // This needs to be ordered the same way that
-  // ldmatrix.x4 would order it
-  // TODO: this needs to be refactor so we don't
-  // implicitly depends on how emitOffsetsForMMAV2
-  // is implemented
-  SmallVector<Value> reorderedVals;
-  for (unsigned i = 0; i < vecVals.size(); i += 4) {
-    reorderedVals.push_back(bitcast(vecVals[i], i32_ty));
-    reorderedVals.push_back(bitcast(vecVals[i + 2], i32_ty));
-    reorderedVals.push_back(bitcast(vecVals[i + 1], i32_ty));
-    reorderedVals.push_back(bitcast(vecVals[i + 3], i32_ty));
-  }
-
-  Value view = packLLElements(loc, getTypeConverter(), reorderedVals,
-                              rewriter, dstTy);
+  Value view =
+      packLLElements(loc, getTypeConverter(), vecVals, rewriter, dstTy);
   rewriter.replaceOp(op, view);
   return success();
 }
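
The deleted block above is the [0, 2, 1, 3] quad shuffle that used to run after ldmatrix; with this commit the producers emit registers in that order directly. A standalone sketch (plain ints, hypothetical values, not the Triton API) of what the removed reorder did:

#include <vector>

// Old post-ldmatrix reorder: swap the middle pair of every group of four.
std::vector<int> oldReorder(const std::vector<int> &vecVals) {
  std::vector<int> out;
  for (size_t i = 0; i + 3 < vecVals.size(); i += 4) {
    out.push_back(vecVals[i]);
    out.push_back(vecVals[i + 2]);
    out.push_back(vecVals[i + 1]);
    out.push_back(vecVals[i + 3]);
  }
  return out;
}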

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp

Lines changed: 1 addition & 1 deletion
@@ -513,8 +513,8 @@ Value composeValuesToDotOperandLayoutStruct(
       for (int m = 0; m < n0; ++m)
         for (int k = 0; k < n1; ++k) {
           elems.push_back(vals.at({b, 2 * m, 2 * k}));
-          elems.push_back(vals.at({b, 2 * m, 2 * k + 1}));
           elems.push_back(vals.at({b, 2 * m + 1, 2 * k}));
+          elems.push_back(vals.at({b, 2 * m, 2 * k + 1}));
           elems.push_back(vals.at({b, 2 * m + 1, 2 * k + 1}));
         }
   assert(!elems.empty());
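
A small sketch (standalone C++, hypothetical tile counts) that prints the new per-tile emission order, showing that both rows of column 2k now precede column 2k+1, the same order the consumer-side swap in MMAv2.cpp expects:

#include <cstdio>

int main() {
  const int n0 = 2, n1 = 2; // hypothetical tile counts
  for (int m = 0; m < n0; ++m)
    for (int k = 0; k < n1; ++k)
      printf("(%d,%d) (%d,%d) (%d,%d) (%d,%d)\n",
             2 * m, 2 * k,          // row 2m,   col 2k
             2 * m + 1, 2 * k,      // row 2m+1, col 2k
             2 * m, 2 * k + 1,      // row 2m,   col 2k+1
             2 * m + 1, 2 * k + 1); // row 2m+1, col 2k+1
}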

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv2.cpp

Lines changed: 75 additions & 1 deletion
@@ -62,12 +62,86 @@ ValueTableV2 getValuesFromDotOperandLayoutStruct(
   auto elems = unpackLLElements(loc, value, rewriter);
   int offset{};
   ValueTableV2 vals;
+
+  // FIXME [Dot LL]
+  // [ez] Generalize the logic below for kWidth * elemBitWidth > 32
+  auto dot = cast<DotOperandEncodingAttr>(type.getEncoding());
+  auto largeK = dot.getKWidth() == 8 &&
+                cast<NvidiaMmaEncodingAttr>(dot.getParent()).isAmpere();
+  if (largeK) {
+    llvm::SmallVector<unsigned> si;
+
+    // For kWidth = 8, split the mma into 4 mmas with "stride 4" along K
+    if (dot.getOpIdx() == 0) {
+      // Original register layout:
+      //
+      //   [0, 1, 2, 3], [8, 9, 10, 11]
+      //   [4, 5, 6, 7], [12, 13, 14, 15]
+      //
+      // Each element in the layout consists of two bf16 values.
+      // For example, the row [0, 1, 2, 3] expands to:
+      //
+      //   [[0/0, 0/1], [1/0, 1/1], [2/0, 2/1], [3/0, 3/1]]
+      //
+      // Here, 0/0 refers to the first half of element 0, and 0/1 refers to
+      // the second half, matching kWidth = 8.
+      //
+      // To derive four independent MMA operations, a stride of 4 is applied
+      // to the original register layout:
+      //
+      //   1st MMA: [0, 4, 8, 12]
+      //   2nd MMA: [1, 5, 9, 13]
+      //   3rd MMA: [2, 6, 10, 14]
+      //   4th MMA: [3, 7, 11, 15]
+      si = llvm::SmallVector<unsigned>{0, 4, 8, 12, 1, 5, 9, 13,
+                                       2, 6, 10, 14, 3, 7, 11, 15};
+    } else {
+      // Original register layout:
+      //
+      //   [0, 1, 2, 3]^T, [4, 5, 6, 7]^T
+      //
+      // A stride of 4 is applied to derive four independent MMA operations:
+      //
+      //   1st MMA: [0, 4]
+      //   2nd MMA: [1, 5]
+      //   3rd MMA: [2, 6]
+      //   4th MMA: [3, 7]
+      si = llvm::SmallVector<unsigned>{0, 4, 1, 5, 2, 6, 3, 7};
+    }
+
+    auto step = si.size();
+    SmallVector<Value> perm(step);
+    for (auto i = 0; i < elems.size() / step; ++i) {
+      for (auto j = 0; j < step; ++j) {
+        perm[j] = elems[i * step + si[j]];
+      }
+      std::copy(perm.begin(), perm.end(), elems.begin() + i * step);
+    }
+
+    if (dot.getOpIdx() == 1) {
+      // there are kWidth * 2 elems packed as bf16x2
+      int elemsInTile = dot.getKWidth();
+      // n0 and n1 are unrolled in the legacy path
+      // Unrolling n1 makes some sense, but unrolling n0 makes absolutely no
+      // sense IMO
+      n0 *= 2;
+      n1 *= 2;
+      for (auto b = 0; b < batch; ++b)
+        for (auto j = 0; j < n1 / elemsInTile; ++j)
+          for (auto i = 0; i < n0; ++i)
+            for (auto k = 0; k < elemsInTile; ++k) {
+              vals[{b, i, elemsInTile * j + k}] = elems[offset++];
+            }
+      return vals;
+    }
+  }
+
   for (auto b = 0; b < batch; ++b)
     for (auto i = 0; i < n0; ++i) {
       for (auto j = 0; j < n1; j++) {
         vals[{b, 2 * i, 2 * j}] = elems[offset++];
-        vals[{b, 2 * i, 2 * j + 1}] = elems[offset++];
         vals[{b, 2 * i + 1, 2 * j}] = elems[offset++];
+        vals[{b, 2 * i, 2 * j + 1}] = elems[offset++];
         vals[{b, 2 * i + 1, 2 * j + 1}] = elems[offset++];
       }
     }
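
To see the in-place permutation concretely, here is a minimal standalone sketch (plain ints instead of LLVM Values) applying the operand-A index table si to two groups of 16 registers:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<unsigned> si = {0, 4, 8, 12, 1, 5, 9, 13,
                                    2, 6, 10, 14, 3, 7, 11, 15};
  std::vector<int> elems(32);
  for (int i = 0; i < 32; ++i)
    elems[i] = i; // stand-in register ids
  const size_t step = si.size();
  std::vector<int> perm(step);
  for (size_t i = 0; i < elems.size() / step; ++i) {
    for (size_t j = 0; j < step; ++j)
      perm[j] = elems[i * step + si[j]];
    std::copy(perm.begin(), perm.end(), elems.begin() + i * step);
  }
  // The first group now reads 0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15:
  // the registers of the four K-strided sub-MMAs, grouped per MMA.
  for (int v : elems)
    printf("%d ", v);
  printf("\n");
}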

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 54 additions & 0 deletions
@@ -27,6 +27,60 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
       : ConvertOpToLLVMPattern<UpcastMXFPOp>(typeConverter, benefit),
         targetInfo(targetInfo) {}
 
+  llvm::SmallVector<Value>
+  unpackFP4Elements(Location loc, ConversionPatternRewriter &rewriter,
+                    const llvm::SmallVector<Value> &vals, Value laneId) const {
+    auto fp4x2ToBf16x2 = [&loc, &rewriter](Value v) -> Value {
+      auto em0 = and_(v, i8_val(0x70));
+      auto em1 = and_(v, i8_val(0x7));
+      Value v0 = or_(shl(zext(i16_ty, em0), i16_val(2)),
+                     shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));
+      Value v1 = or_(shl(zext(i16_ty, em1), i16_val(6)),
+                     shl(zext(i16_ty, and_(v, i8_val(0x8))), i16_val(12)));
+
+      // Three cases:
+      // 1) x is normal and non-zero: Correct bias
+      v0 = select(icmp_ne(and_(em0, i8_val(0x60)), i8_val(0)),
+                  add(v0, i16_val((127 - 1) << 7)), v0);
+      v1 = select(icmp_ne(and_(em1, i8_val(0x6)), i8_val(0)),
+                  add(v1, i16_val((127 - 1) << 7)), v1);
+
+      // 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in
+      //    bf16
+      v0 = select(icmp_eq(em0, i8_val(0x10)),
+                  or_(i16_val(16128), and_(v0, i16_val(0x8000))), v0);
+      v1 = select(icmp_eq(em1, i8_val(0x1)),
+                  or_(i16_val(16128), and_(v1, i16_val(0x8000))), v1);
+      // 3) x is zero, nothing to do
+
+      // Swap as they come packed in big endian
+      return or_(zext(i32_ty, v0), shl(zext(i32_ty, v1), i32_val(16)));
+    };
+
+    auto fp4x8ToBf16x2 = [&loc, &rewriter, &fp4x2ToBf16x2](
+                             Value v) -> llvm::SmallVector<Value, 4> {
+      llvm::SmallVector<Value, 4> results(4);
+      for (int i = 0; i < 4; ++i) {
+        auto v_i = trunc(i8_ty, lshr(v, i32_val(8 * i)));
+        results[i] = fp4x2ToBf16x2(v_i);
+      }
+      return results;
+    };
+
+    // Split fp4x8 into 4 bf16x2
+    llvm::SmallVector<Value> ret;
+    ret.reserve(vals.size() * 4);
+    for (int i = 0; i < vals.size(); ++i) {
+      auto vs = fp4x8ToBf16x2(vals[i]);
+      assert(vs.size() == 4);
+      for (auto v : vs) {
+        ret.push_back(v);
+      }
+    }
+
+    return ret;
+  }
+
   LogicalResult
   matchAndRewrite(UpcastMXFPOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
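
The lambda above performs the e2m1 to bf16 conversion with LLVM dialect ops; the same bit manipulation as a scalar C++ reference (an illustrative standalone function, not part of the patch) may make the three cases easier to follow:

#include <cstdint>

// Convert one 4-bit e2m1 value (low nibble: s e1 e0 m) to bf16 bits.
uint16_t fp4ToBf16Bits(uint8_t nibble) {
  uint16_t sign = (nibble & 0x8) ? 0x8000 : 0;
  uint16_t em = nibble & 0x7; // exponent and mantissa bits
  if (em == 0)
    return sign; // 3) zero: nothing to do
  if (em == 1)
    return sign | 0x3F00; // 2) subnormal: +-0.5 in bf16 (16128)
  // 1) normal: m lands on the top mantissa bit, e on the low exponent
  //    bits; then rebias from the fp4 bias (1) to the bf16 bias (127).
  return (sign | (em << 6)) + ((127 - 1) << 7);
}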
