
Commit 47d57c1

Pangoraw, giordano, and wsmoses authored
Triton forward mode AD (#1578)
* Triton forward mode AD
* fmt build
* header
* Bump enzyme
* Update TritonDerivatives.td

Co-authored-by: Mosè Giordano <[email protected]>
Co-authored-by: William Moses <[email protected]>
1 parent 3bb71d5 commit 47d57c1


5 files changed: +207 -0 lines changed


src/enzyme_ad/jax/BUILD

Lines changed: 15 additions & 0 deletions
@@ -266,6 +266,20 @@ td_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "triton-derivatives",
+    tbl_outs = [(
+        ["-gen-mlir-derivatives"],
+        "Implementations/TritonDerivatives.inc",
+    )],
+    tblgen = "@enzyme//:enzyme-tblgen",
+    td_file = "Implementations/TritonDerivatives.td",
+    td_srcs = [
+        "Implementations/TritonDerivatives.td",
+    ],
+    deps = [":ImplementationsCommonTdFiles"],
+)
+
 gentbl_cc_library(
     name = "mhlo-derivatives",
     tbl_outs = [(
@@ -874,6 +888,7 @@ cc_library(
         ":enzymexla-derivatives",
         ":mhlo-derivatives",
         ":stablehlo-derivatives",
+        ":triton-derivatives",
         "//src/external/isl:Isl",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",

src/enzyme_ad/jax/Implementations/TritonAutoDiffOpInterfaceImpl.cpp

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
//===- TritonAutoDiffOpInterfaceImpl.cpp - Interface external model -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the external model implementation of the automatic
// differentiation op interfaces for the MLIR tt dialect.
//
//===----------------------------------------------------------------------===//

#include "Enzyme/MLIR/Implementations/CoreDialectsAutoDiffImplementations.h"
#include "Enzyme/MLIR/Interfaces/AutoDiffOpInterface.h"
#include "Enzyme/MLIR/Interfaces/GradientUtils.h"
#include "Enzyme/MLIR/Interfaces/GradientUtilsReverse.h"

#include "triton/Dialect/Triton/IR/Dialect.h"

#include "src/enzyme_ad/jax/Implementations/XLADerivatives.h"

using namespace mlir;
using namespace mlir::enzyme;
using namespace mlir::triton;

namespace {

#include "src/enzyme_ad/jax/Implementations/TritonDerivatives.inc"

class TritonPointerTypeInterface
    : public AutoDiffTypeInterface::ExternalModel<TritonPointerTypeInterface,
                                                  triton::PointerType> {
public:
  mlir::Value createNullValue(mlir::Type self, OpBuilder &builder,
                              Location loc) const {
    llvm_unreachable("TODO");
  }

  Value createAddOp(Type self, OpBuilder &builder, Location loc, Value a,
                    Value b) const {
    llvm_unreachable("TODO");
  }

  Value createConjOp(Type self, OpBuilder &builder, Location loc,
                     Value a) const {
    llvm_unreachable("TODO");
  }

  Type getShadowType(Type self, unsigned width) const {
    assert(width == 1 && "unsupported width != 1");
    return self;
  }

  bool isMutable(Type self) const { return true; }

  LogicalResult zeroInPlace(Type self, OpBuilder &builder, Location loc,
                            Value val) const {
    // TODO inspect val and memset corresponding size
    return failure();
  }

  bool isZero(Type self, Value val) const { return false; }
  bool isZeroAttr(Type self, Attribute attr) const { return false; }
};

class AutoDiffTritonFuncFunctionInterface
    : public AutoDiffFunctionInterface::ExternalModel<
          AutoDiffTritonFuncFunctionInterface, triton::FuncOp> {
public:
  void transformResultTypes(Operation *self,
                            SmallVectorImpl<Type> &returnTypes) const {}

  Operation *createCall(Operation *self, OpBuilder &builder, Location loc,
                        ValueRange args) const {
    return triton::CallOp::create(builder, loc, cast<triton::FuncOp>(self),
                                  args);
  }

  Operation *createReturn(Operation *self, OpBuilder &builder, Location loc,
                          ValueRange retArgs) const {
    return triton::ReturnOp::create(builder, loc, retArgs);
  }
};

} // end anonymous namespace

void mlir::enzyme::registerTritonDialectAutoDiffInterface(
    DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *context, triton::TritonDialect *) {
    registerInterfaces(context);
    triton::FuncOp::attachInterface<AutoDiffTritonFuncFunctionInterface>(
        *context);
    triton::PointerType::attachInterface<TritonPointerTypeInterface>(*context);
  });
}
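
For orientation, here is a minimal hand-written sketch (not part of this commit) of how the type interface attached above is consumed once registered: a !tt.ptr<f32> type can be cast to AutoDiffTypeInterface and queried. It reuses the same includes and namespaces as the implementation file above; the helper name inspectPointerType is hypothetical.

#include "Enzyme/MLIR/Interfaces/AutoDiffOpInterface.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

using namespace mlir;
using namespace mlir::enzyme;

// Hypothetical helper: query the AutoDiff type interface on a !tt.ptr type
// after registerTritonDialectAutoDiffInterface has attached the external model.
static void inspectPointerType(Type ptrTy) {
  if (auto iface = dyn_cast<AutoDiffTypeInterface>(ptrTy)) {
    // Pointers shadow as themselves: getShadowType(1) returns the same type.
    Type shadowTy = iface.getShadowType(/*width=*/1);
    // tt.ptr is memory-backed, so the external model reports it as mutable.
    bool mut = iface.isMutable();
    (void)shadowTy;
    (void)mut;
  }
}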

src/enzyme_ad/jax/Implementations/TritonDerivatives.td

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
include "src/enzyme_ad/jax/Implementations/Common.td"

class TritonDerivative<string opName_, dag patternToMatch, list<dag> resultOps, dag forwardOps=(ForwardFromSummedReverse)> : MLIRDerivative<"triton", opName_, patternToMatch, resultOps, forwardOps>;

class TritonInst<string m, string postopt="", string preopt=""> : Inst<m, "triton", postopt, preopt>;

class TritonMemoryIdentityOp<string opName_, list<int> ptrargs_, list<int> storedargs_ = [], dag patternToMatch=(Unimplemented), list<dag> reverse_ = []> : MemoryIdentityOp<"triton", opName_, ptrargs_, storedargs_, patternToMatch, reverse_>;

class TritonReadOnlyIdentityOp<string opName_, list<int> ptrargs_ = [0], dag patternToMatch=(Unimplemented), list<dag> reverse_ = []> : ReadOnlyIdentityOp<"triton", opName_, ptrargs_, patternToMatch, reverse_>;

class ArithConstantFP<string m> : ConstantFP<m, "arith", "ConstantOp", "mlir::ElementsAttr">;

class TritonInactiveOp<string m> : InactiveOp<"triton", m>;

class TritonReturnOp<string m> : ReturnOp<"triton", m>;

def FpToFp : TritonInst<"FpToFpOp">;
def PreciseDivF : TritonInst<"PreciseDivFOp">;
def MakeRange : TritonInst<"MakeRangeOp">;

def : TritonReturnOp<"ReturnOp">;

def : TritonInactiveOp<"AssertOp">;
def : TritonInactiveOp<"MakeRangeOp">;
def : TritonInactiveOp<"PrintOp">;

def : ReadOnlyIdentityOp<"triton", "AddPtrOp", [0]>;
def : ReadOnlyIdentityOp<"triton", "AdvanceOp", [0]>;
def : ReadOnlyIdentityOp<"triton", "LoadOp", [0]>;
def : ReadOnlyIdentityOp<"triton", "SplatOp", [0]>;
def : MemoryIdentityOp<"triton", "StoreOp", [1], [0]>;

def FpToFpRoundingMode : GlobalExpr</*needsprimal*/0, /*needsshadow*/0, [{
  op.getRoundingAttr();
}]>;

def : TritonDerivative<"FpToFpOp", (Op $x),
  [
    (FpToFp (TypeOf $x), (DiffeRet), (FpToFpRoundingMode))
  ]
>;

src/enzyme_ad/jax/Implementations/XLADerivatives.h

Lines changed: 2 additions & 0 deletions
@@ -14,13 +14,15 @@ void registerMHLODialectAutoDiffInterface(mlir::DialectRegistry &registry);
 void registerStableHLODialectAutoDiffInterface(mlir::DialectRegistry &registry);
 void registerCHLODialectAutoDiffInterface(mlir::DialectRegistry &registry);
 void registerEnzymeXLADialectAutoDiffInterface(mlir::DialectRegistry &registry);
+void registerTritonDialectAutoDiffInterface(mlir::DialectRegistry &registry);
 
 static inline void
 registerXLAAutoDiffInterfaces(mlir::DialectRegistry &registry) {
   registerMHLODialectAutoDiffInterface(registry);
   registerStableHLODialectAutoDiffInterface(registry);
   registerCHLODialectAutoDiffInterface(registry);
   registerEnzymeXLADialectAutoDiffInterface(registry);
+  registerTritonDialectAutoDiffInterface(registry);
 }
 } // namespace enzyme
 } // namespace mlir
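
A usage sketch (assumed driver code, not from this commit): a tool that wants these derivative interfaces only needs to register the Triton dialect and call registerXLAAutoDiffInterfaces, which after this change also attaches the Triton interfaces. The surrounding main and pass setup are placeholders.

#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"
#include "src/enzyme_ad/jax/Implementations/XLADerivatives.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

int main() {
  mlir::DialectRegistry registry;
  // Make the Triton dialect loadable in this context.
  registry.insert<mlir::triton::TritonDialect>();
  // Attaches the MHLO/StableHLO/CHLO/EnzymeXLA AutoDiff interfaces and, with
  // this commit, the Triton ones via registerTritonDialectAutoDiffInterface.
  mlir::enzyme::registerXLAAutoDiffInterfaces(registry);
  mlir::MLIRContext context(registry);
  // ... parse a module and run the Enzyme pipeline (e.g. enzyme-wrap) here ...
  return 0;
}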
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
// RUN: enzymexlamlir-opt %s --enzyme-wrap="infn=add_kernel outfn= argTys=enzyme_dup,enzyme_const,enzyme_dup,enzyme_const retTys= mode=ForwardMode" | FileCheck %s

module {
  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %c1024_i32 = arith.constant 1024 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c1024_i32 : i32
    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
    %3 = tt.splat %1 : i32 -> tensor<1024xi32>
    %4 = arith.addi %3, %2 : tensor<1024xi32>
    %5 = tt.splat %arg3 : i32 -> tensor<1024xi32>
    %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32>
    %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
    %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>>
    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>>
    %13 = arith.addf %9, %12 : tensor<1024xf32>
    %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>>
    tt.return
  }
}

// CHECK: tt.func @add_kernel(%[[arg0:.+]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[arg1:.+]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[arg2:.+]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[arg3:.+]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[arg4:.+]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[arg5:.+]]: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
// CHECK-NEXT: %[[c1024_i32:.+]] = arith.constant 1024 : i32
// CHECK-NEXT: %[[v0:.+]] = tt.get_program_id x : i32
// CHECK-NEXT: %[[v1:.+]] = arith.muli %[[v0]], %[[c1024_i32]] : i32
// CHECK-NEXT: %[[v2:.+]] = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
// CHECK-NEXT: %[[v3:.+]] = tt.splat %[[v1]] : i32 -> tensor<1024xi32>
// CHECK-NEXT: %[[v4:.+]] = arith.addi %[[v3]], %[[v2]] : tensor<1024xi32>
// CHECK-NEXT: %[[v5:.+]] = tt.splat %[[arg5]] : i32 -> tensor<1024xi32>
// CHECK-NEXT: %[[v6:.+]] = arith.cmpi slt, %[[v4]], %[[v5]] : tensor<1024xi32>
// CHECK-NEXT: %[[v7:.+]] = tt.splat %[[arg1]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v8:.+]] = tt.splat %[[arg0]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v9:.+]] = tt.addptr %[[v7]], %[[v4]] : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
// CHECK-NEXT: %[[v10:.+]] = tt.addptr %[[v8]], %[[v4]] : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
// CHECK-NEXT: %[[v11:.+]] = tt.load %[[v9]], %[[v6]] : tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v12:.+]] = tt.load %[[v10]], %[[v6]] : tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v13:.+]] = tt.splat %[[arg2]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v14:.+]] = tt.addptr %[[v13]], %[[v4]] : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
// CHECK-NEXT: %[[v15:.+]] = tt.load %[[v14]], %[[v6]] : tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v16:.+]] = arith.addf %[[v12]], %[[v15]] : tensor<1024xf32>
// CHECK-NEXT: %[[v17:.+]] = tt.splat %[[arg4]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v18:.+]] = tt.splat %[[arg3]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: %[[v19:.+]] = tt.addptr %[[v17]], %[[v4]] : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
// CHECK-NEXT: %[[v20:.+]] = tt.addptr %[[v18]], %[[v4]] : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
// CHECK-NEXT: tt.store %[[v19]], %[[v11]], %[[v6]] : tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: tt.store %[[v20]], %[[v16]], %[[v6]] : tensor<1024x!tt.ptr<f32>>
// CHECK-NEXT: tt.return
// CHECK-NEXT: }
