Skip to content

Commit 7a5940c

Browse files
jax-triton-devkarupayun
authored and committed
OpenXLA-specific changes
1 parent b2de88f commit 7a5940c

File tree

45 files changed

+2187
-114
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+2187
-114
lines changed

BUILD

Lines changed: 900 additions & 0 deletions
Large diffs are not rendered by default.

include/triton/Analysis/Alias.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,9 @@ class SharedMemoryAliasAnalysis
8585
}
8686

8787
/// Computes if the alloc set of the results are changed.
88-
void
89-
visitOperation(Operation *op,
90-
ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
91-
ArrayRef<dataflow::Lattice<AliasInfo> *> results) override;
88+
LogicalResult visitOperation(
89+
Operation *op, ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
90+
ArrayRef<dataflow::Lattice<AliasInfo> *> results) override;
9291
};
9392

9493
} // namespace mlir

include/triton/Dialect/Triton/IR/TritonTypes.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class TritonTypeDef<string name, string _mnemonic, list<Trait> traits = []>
1515
}
1616

1717
// Floating-point Type
18-
def TT_Float : AnyTypeOf<[F8E4M3FNUZ, F8E5M2, F8E5M2FNUZ, F16, BF16, F32, F64], "floating-point">;
18+
def TT_Float : AnyTypeOf<[F8E4M3FN, F8E4M3FNUZ, F8E5M2, F8E5M2FNUZ, F16, BF16, F32, F64], "floating-point">;
1919
def TT_FloatTensor : RankedTensorOf<[TT_Float]>;
2020
def TT_FloatLike : AnyTypeOf<[TT_Float, TT_FloatTensor]>;
2121

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ template <typename Int> Int ceil(Int m, Int n) { return (m + n - 1) / n; }
3131

3232
/// Get the highest power of 2 divisor of an integer.
3333
template <typename T> T highestPowOf2Divisor(T n) {
34-
if (n == 0) {
34+
// When n is 0 or min, return the highest power of 2. The min case is handled
35+
// separately to avoid underflow when T is a signed integer. Technically
36+
// in that case the correct divisor is -n, but this value is outside the
37+
// range of possible values, so we take the next best alternative.
38+
if (n == 0 || n == std::numeric_limits<T>::min()) {
3539
return (static_cast<T>(1) << (sizeof(T) * 8 - 2));
3640
}
3741
return (n & (~(n - 1)));

lib/Analysis/Alias.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ AliasInfo AliasInfo::join(const AliasInfo &lhs, const AliasInfo &rhs) {
2121
return ret;
2222
}
2323

24-
void SharedMemoryAliasAnalysis::visitOperation(
24+
LogicalResult SharedMemoryAliasAnalysis::visitOperation(
2525
Operation *op, ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
2626
ArrayRef<dataflow::Lattice<AliasInfo> *> results) {
2727
AliasInfo aliasInfo;
@@ -31,7 +31,7 @@ void SharedMemoryAliasAnalysis::visitOperation(
3131
if (auto memdescTy = dyn_cast<triton::MemDescType>(result.getType())) {
3232
if (!isa_and_nonnull<triton::gpu::SharedMemorySpaceAttr>(
3333
memdescTy.getMemorySpace()))
34-
return;
34+
return mlir::success();
3535
}
3636

3737
// Only LocalAllocOp creates a new buffer.
@@ -49,11 +49,13 @@ void SharedMemoryAliasAnalysis::visitOperation(
4949
}
5050

5151
if (pessimistic) {
52-
return setAllToEntryStates(results);
52+
setAllToEntryStates(results);
53+
return mlir::success();
5354
}
5455
// Join all lattice elements
5556
for (auto *result : results)
5657
propagateIfChanged(result, result->join(aliasInfo));
58+
return mlir::success();
5759
}
5860

5961
AliasResult SharedMemoryAliasAnalysis::alias(Value lhs, Value rhs) {

lib/Analysis/AxisInfo.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,9 @@ class AxisInfoAnalysis : public dataflow::SparseForwardDataFlowAnalysis<
195195
dataflow::Lattice<AxisInfo>>::getLatticeElement;
196196
using FuncAxisInfoMapT = DenseMap<FunctionOpInterface, AxisInfo>;
197197

198-
void visitOperation(Operation *op,
199-
ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
200-
ArrayRef<dataflow::Lattice<AxisInfo> *> results) override;
198+
LogicalResult visitOperation(
199+
Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
200+
ArrayRef<dataflow::Lattice<AxisInfo> *> results) override;
201201
void
202202
visitForOpInductionVar(scf::ForOp op,
203203
ArrayRef<dataflow::Lattice<AxisInfo> *> argLattices);
@@ -1039,7 +1039,7 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver)
10391039
visitors.append<LoadOpAxisInfoVisitor>();
10401040
}
10411041

1042-
void AxisInfoAnalysis::visitOperation(
1042+
LogicalResult AxisInfoAnalysis::visitOperation(
10431043
Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
10441044
ArrayRef<dataflow::Lattice<AxisInfo> *> results) {
10451045
// TODO: For sure not the right way to do this
@@ -1048,8 +1048,10 @@ void AxisInfoAnalysis::visitOperation(
10481048
if (op->getValue().getRank() == 0)
10491049
setToEntryState((dataflow::Lattice<AxisInfo> *)op);
10501050
AxisInfo curr = visitors.apply(op, operands);
1051-
if (curr.getRank() == 0)
1052-
return setAllToEntryStates(results);
1051+
if (curr.getRank() == 0) {
1052+
setAllToEntryStates(results);
1053+
return mlir::success();
1054+
}
10531055
// override with hint
10541056
auto newContiguity = curr.getContiguity();
10551057
auto newDivisibility = curr.getDivisibility();
@@ -1071,6 +1073,7 @@ void AxisInfoAnalysis::visitOperation(
10711073
// join all lattice elements
10721074
for (auto *result : results)
10731075
propagateIfChanged(result, result->join(curr));
1076+
return mlir::success();
10741077
}
10751078

10761079
void AxisInfoAnalysis::visitForOpInductionVar(

lib/Analysis/Utility.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ bool supportMFMATypes(Type a, Type b) {
425425
if (a.getIntOrFloatBitWidth() != b.getIntOrFloatBitWidth())
426426
return false;
427427

428+
auto F8E4M3FN = TypeID::get<Float8E4M3FNType>();
428429
auto F8E5M2 = TypeID::get<Float8E5M2Type>();
429430
auto F8E4M3FNUZ = TypeID::get<Float8E4M3FNUZType>();
430431
auto F8E5M2FNUZ = TypeID::get<Float8E5M2FNUZType>();
@@ -436,6 +437,7 @@ bool supportMFMATypes(Type a, Type b) {
436437
{F32, F32},
437438
{F16, F16},
438439
{BF16, BF16},
440+
{F8E4M3FN, F8E4M3FN},
439441
{F8E5M2, F8E5M2},
440442
{F8E4M3FNUZ, F8E4M3FNUZ},
441443
{F8E4M3FNUZ, F8E5M2FNUZ},
@@ -495,14 +497,14 @@ bool supportMMA(triton::DotOp op, int version) {
495497
return false;
496498
if (!(numWarps % 4 == 0 && retShapePerCTA[rank - 2] % 64 == 0 &&
497499
retShapePerCTA[rank - 1] % 8 == 0 &&
498-
(aElemTy.isFloat8E5M2() || aElemTy.isFloat8E4M3FNUZ() ||
500+
(aElemTy.isFloat8E5M2() || aElemTy.isFloat8E4M3FN() ||
499501
aElemTy.isInteger(8) || aElemTy.isF16() || aElemTy.isBF16() ||
500502
aElemTy.isF32()))) {
501503
return false;
502504
}
503505
// We cannot use MMA_V3 if we need to accumulate in F32 within the MMA op.
504506
if (op.getMaxNumImpreciseAcc() < 32 &&
505-
(aElemTy.isFloat8E5M2() || aElemTy.isFloat8E4M3FNUZ()) &&
507+
(aElemTy.isFloat8E5M2() || aElemTy.isFloat8E4M3FN()) &&
506508
cast<RankedTensorType>(op.getType()).getElementType().isF32()) {
507509
return false;
508510
}

lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ SmallVector<Value> reorderValues(const SmallVector<Value> &values, Type inType,
4040
auto ouEltTy = ouTensorTy.getElementType();
4141
if (inBitWidth == ouBitWidth)
4242
return values;
43-
if (inBitWidth == 16 && ouBitWidth == 32) {
43+
if ((inBitWidth == 16 && ouBitWidth == 32) ||
44+
(inBitWidth == 32 && ouBitWidth == 16)) {
4445
SmallVector<Value> ret;
4546
for (unsigned i = 0; i < values.size(); i += 8) {
4647
ret.push_back(values[i]);

lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ TritonGPUToLLVMTypeConverter::TritonGPUToLLVMTypeConverter(
3434
addConversion([&](mlir::Float8E4M3FNUZType type) -> std::optional<Type> {
3535
return IntegerType::get(type.getContext(), 8);
3636
});
37+
addConversion([&](mlir::Float8E4M3FNType type) -> std::optional<Type> {
38+
return IntegerType::get(type.getContext(), 8);
39+
});
3740
addConversion([&](mlir::Float8E5M2Type type) -> std::optional<Type> {
3841
return IntegerType::get(type.getContext(), 8);
3942
});

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,9 @@ struct ArithConstantSplatOpConversion
8787
// LLVM IR.
8888
if (type::isFloat8(elemType))
8989
elemType = rewriter.getIntegerType(8);
90-
auto constOp = rewriter.create<LLVM::ConstantOp>(loc, elemType, val);
9190
auto typeConverter = getTypeConverter();
91+
auto constOp = rewriter.create<LLVM::ConstantOp>(
92+
loc, typeConverter->convertType(elemType), val);
9293
auto llStruct = SplatOpConversion::convertSplatLikeOp(
9394
elemType, op.getType(), constOp, typeConverter, rewriter, loc);
9495
rewriter.replaceOp(op, llStruct);

0 commit comments

Comments (0)