From 6af3eef5c7279e5fd9d8625e0ea6c463bec1c261 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 15 Aug 2024 16:29:07 -0400
Subject: [PATCH 001/183] Setup latest dependency for new bufferization passes

---
 .dep-versions                 | 2 +-
 frontend/catalyst/__init__.py | 2 +-
 mlir/Makefile                 | 2 +-
 mlir/llvm-project             | 2 +-
 mlir/mlir-hlo                 | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.dep-versions b/.dep-versions
index 5480b17f2f..bfd7917e8f 100644
--- a/.dep-versions
+++ b/.dep-versions
@@ -1,5 +1,5 @@
 # Always update the version check in catalyst.__init__ when changing the JAX version.
-jax=0.4.28
+jax=0.4.31
 mhlo=89a891c986650c33df76885f5620e0a92150d90f
 llvm=3a8316216807d64a586b971f51695e23883331f7
 enzyme=v0.0.130
diff --git a/frontend/catalyst/__init__.py b/frontend/catalyst/__init__.py
index af9d10d1d3..aeac5ffd69 100644
--- a/frontend/catalyst/__init__.py
+++ b/frontend/catalyst/__init__.py
@@ -23,7 +23,7 @@
 
 import jaxlib as _jaxlib
 
-_jaxlib_version = "0.4.28"
+_jaxlib_version = "0.4.31"
 if _jaxlib.__version__ != _jaxlib_version:
     import warnings
 
diff --git a/mlir/Makefile b/mlir/Makefile
index 1d5a126ef6..797f75abd2 100644
--- a/mlir/Makefile
+++ b/mlir/Makefile
@@ -118,7 +118,7 @@ enzyme:
 		-DCMAKE_CXX_VISIBILITY_PRESET=$(SYMBOL_VISIBILITY) \
 		-DCMAKE_POLICY_DEFAULT_CMP0116=NEW
 
-	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-19
+	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-20
 
 .PHONY: dialects
 dialects:
diff --git a/mlir/llvm-project b/mlir/llvm-project
index 3a83162168..51d4980a13 160000
--- a/mlir/llvm-project
+++ b/mlir/llvm-project
@@ -1 +1 @@
-Subproject commit 3a8316216807d64a586b971f51695e23883331f7
+Subproject commit 51d4980a133db12888207698e39c469cb7055cac
diff --git a/mlir/mlir-hlo b/mlir/mlir-hlo
index 89a891c986..1d15157654 160000
--- a/mlir/mlir-hlo
+++ b/mlir/mlir-hlo
@@ -1 +1 @@
-Subproject commit 89a891c986650c33df76885f5620e0a92150d90f
+Subproject commit 1d151576543aebf134f76ae1ebf86da11623b912

From 951ef36784e4bb33e35b0bbc11da9c1483ac7971 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 15 Aug 2024 16:30:43 -0400
Subject: [PATCH 002/183] enableRegionSimplification accepts
 GreedySimplifyRegionLevel type instead of bool

---
 mlir/lib/Catalyst/Transforms/DetectQNodes.cpp        | 2 +-
 mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
index 6acdb8c2bf..242421da81 100644
--- a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
+++ b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
@@ -889,7 +889,7 @@ struct AddExceptionHandlingPass : impl::AddExceptionHandlingPassBase<AddExceptio
        patterns.add<DetectQnodeTransform>(context);
         GreedyRewriteConfig config;
         config.strictMode = GreedyRewriteStrictness::ExistingOps;
-        config.enableRegionSimplification = false;
+        config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
         config.maxIterations = 1;
 
         auto op = getOperation();
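Note on PATCH 002: with the LLVM bump in PATCH 001, GreedyRewriteConfig::enableRegionSimplification is no longer a bool but a GreedySimplifyRegionLevel enum (Disabled, Normal, Aggressive). A minimal sketch of the new usage follows; the enum and config fields are upstream MLIR API, but the wrapper function itself is hypothetical, not Catalyst code:

    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    // Sketch: configure the greedy rewrite driver against the updated API.
    void configureGreedyDriver(mlir::GreedyRewriteConfig &config) {
        // Previously: config.enableRegionSimplification = false;
        config.enableRegionSimplification = mlir::GreedySimplifyRegionLevel::Disabled;
        config.maxIterations = 1;
    }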
From c8336bdcf80395fde64b34d2fce26ab9c3609414 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 15 Aug 2024 16:36:06 -0400
Subject: [PATCH 003/183] translateModuleToLLVMIR needs an extra
 disableVerification parameter

---
 mlir/lib/Driver/CompilerDriver.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp
index 8b775667c8..af19b09038 100644
--- a/mlir/lib/Driver/CompilerDriver.cpp
+++ b/mlir/lib/Driver/CompilerDriver.cpp
@@ -662,8 +662,9 @@ LogicalResult QuantumDriverMain(const CompilerOptions &options, CompilerOutput &
     outIRStream << *op;
 
     if (options.lowerToLLVM) {
-        llvmModule = timer::timer(translateModuleToLLVMIR, "translateModuleToLLVMIR",
-                                  /* add_endl */ false, *op, llvmContext, "LLVMDialectModule");
+        llvmModule =
+            timer::timer(translateModuleToLLVMIR, "translateModuleToLLVMIR",
+                         /* add_endl */ false, *op, llvmContext, "LLVMDialectModule", false);
         if (!llvmModule) {
             CO_MSG(options, Verbosity::Urgent, "Failed to translate LLVM module\n");
             return failure();

From 21e5933c94ce7615f6468ae3ff8246f13fafc28f Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 15 Aug 2024 17:10:53 -0400
Subject: [PATCH 004/183] Replace equals with compare for llvm::StringRef

---
 mlir/lib/Catalyst/Transforms/AsyncUtils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
index e86cbb9c7d..28026e009b 100644
--- a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
+++ b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
@@ -207,7 +207,7 @@ std::optional<LLVM::LLVMFuncOp> AsyncUtils::getCalleeSafe(LLVM::CallOp callOp)
 bool AsyncUtils::isFunctionNamed(LLVM::LLVMFuncOp funcOp, llvm::StringRef expectedName)
 {
     llvm::StringRef observedName = funcOp.getSymName();
-    return observedName.equals(expectedName);
+    return observedName.compare(expectedName) == 0;
 }
 
 bool AsyncUtils::isMlirAsyncRuntimeCreateValue(LLVM::LLVMFuncOp funcOp)
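Note on PATCH 004 (and PATCH 007 below): llvm::StringRef::equals was deprecated upstream and later removed, which is why the comparison is migrated first to compare() == 0 and then to operator==. A self-contained sketch of the equivalent forms (illustrative only; the function name shown is one of Catalyst's async runtime symbols):

    #include "llvm/ADT/StringRef.h"
    #include <cassert>

    int main() {
        llvm::StringRef observed("mlirAsyncRuntimeCreateValue");
        llvm::StringRef expected("mlirAsyncRuntimeCreateValue");
        // equals() is unavailable in newer LLVM; these two forms are equivalent.
        assert(observed.compare(expected) == 0);
        assert(observed == expected);
        return 0;
    }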
From 9e6ae3f97e8c65409224c5d97df13a0148612518 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 16 Aug 2024 11:41:22 -0400
Subject: [PATCH 005/183] Keep using jax 0.4.28 but update llvm (from jax
 0.4.29)

---
 .dep-versions                 | 6 +++---
 frontend/catalyst/__init__.py | 2 +-
 mlir/llvm-project             | 2 +-
 mlir/mlir-hlo                 | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.dep-versions b/.dep-versions
index bfd7917e8f..992c9ff241 100644
--- a/.dep-versions
+++ b/.dep-versions
@@ -1,7 +1,7 @@
 # Always update the version check in catalyst.__init__ when changing the JAX version.
-jax=0.4.31
-mhlo=89a891c986650c33df76885f5620e0a92150d90f
-llvm=3a8316216807d64a586b971f51695e23883331f7
+jax=0.4.28
+mhlo=39c37c43fb9db18144f2e155a0fe65864646a968
+llvm=6f2c61071c274a1b5e212e6ad4114641ec7c7fc3
 enzyme=v0.0.130
 
 # Always remove custom PL/LQ versions before release.
diff --git a/frontend/catalyst/__init__.py b/frontend/catalyst/__init__.py
index aeac5ffd69..af9d10d1d3 100644
--- a/frontend/catalyst/__init__.py
+++ b/frontend/catalyst/__init__.py
@@ -23,7 +23,7 @@
 
 import jaxlib as _jaxlib
 
-_jaxlib_version = "0.4.31"
+_jaxlib_version = "0.4.28"
 if _jaxlib.__version__ != _jaxlib_version:
     import warnings
 
diff --git a/mlir/llvm-project b/mlir/llvm-project
index 51d4980a13..6f2c61071c 160000
--- a/mlir/llvm-project
+++ b/mlir/llvm-project
@@ -1 +1 @@
-Subproject commit 51d4980a133db12888207698e39c469cb7055cac
+Subproject commit 6f2c61071c274a1b5e212e6ad4114641ec7c7fc3
diff --git a/mlir/mlir-hlo b/mlir/mlir-hlo
index 1d15157654..39c37c43fb 160000
--- a/mlir/mlir-hlo
+++ b/mlir/mlir-hlo
@@ -1 +1 @@
-Subproject commit 1d151576543aebf134f76ae1ebf86da11623b912
+Subproject commit 39c37c43fb9db18144f2e155a0fe65864646a968

From 0cf462bcc03fd1a5a45c9ad3343217a737d5f74b Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 16 Aug 2024 11:42:17 -0400
Subject: [PATCH 006/183] Roll back EnzymeStatic to 19

---
 mlir/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/Makefile b/mlir/Makefile
index 797f75abd2..1d5a126ef6 100644
--- a/mlir/Makefile
+++ b/mlir/Makefile
@@ -118,7 +118,7 @@ enzyme:
 		-DCMAKE_CXX_VISIBILITY_PRESET=$(SYMBOL_VISIBILITY) \
 		-DCMAKE_POLICY_DEFAULT_CMP0116=NEW
 
-	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-20
+	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-19
 
 .PHONY: dialects
 dialects:

From 39b25378d190caa3992eb5433cc2adaef797cf5a Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 16 Aug 2024 12:06:54 -0400
Subject: [PATCH 007/183] Use == instead of compare

---
 mlir/lib/Catalyst/Transforms/AsyncUtils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
index 28026e009b..83dd657dae 100644
--- a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
+++ b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp
@@ -207,7 +207,7 @@ std::optional<LLVM::LLVMFuncOp> AsyncUtils::getCalleeSafe(LLVM::CallOp callOp)
 bool AsyncUtils::isFunctionNamed(LLVM::LLVMFuncOp funcOp, llvm::StringRef expectedName)
 {
     llvm::StringRef observedName = funcOp.getSymName();
-    return observedName.compare(expectedName) == 0;
+    return observedName == expectedName;
 }
 
 bool AsyncUtils::isMlirAsyncRuntimeCreateValue(LLVM::LLVMFuncOp funcOp)

From 5bad7413ea5ac1b418400513d677af45827d1c93 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 16 Aug 2024 12:07:26 -0400
Subject: [PATCH 008/183] Roll back enableRegionSimplification type

---
 mlir/lib/Catalyst/Transforms/DetectQNodes.cpp        | 2 +-
 mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
index 242421da81..6acdb8c2bf 100644
--- a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
+++ b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
@@ -889,7 +889,7 @@ struct AddExceptionHandlingPass : impl::AddExceptionHandlingPassBase<AddExceptio
        patterns.add<DetectQnodeTransform>(context);
         GreedyRewriteConfig config;
         config.strictMode = GreedyRewriteStrictness::ExistingOps;
-        config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
+        config.enableRegionSimplification = false;
         config.maxIterations = 1;
 
         auto op = getOperation();
From 4e9cc289d44eb1490583c6defca60bad0dae7866 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 16 Aug 2024 14:36:22 -0400
Subject: [PATCH 009/183] Comment out retired bufferization passes

---
 frontend/catalyst/compiler.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index c24ea994f6..e8635aaa23 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -198,13 +198,13 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "scf-bufferize",
         "convert-tensor-to-linalg",      # tensor.pad
         "convert-elementwise-to-linalg", # Must be run before --arith-bufferize
-        "arith-bufferize",
+        #"arith-bufferize",
         "empty-tensor-to-alloc-tensor",
-        "func.func(bufferization-bufferize)",
-        "func.func(tensor-bufferize)",
+        #"func.func(bufferization-bufferize)",
+        #"func.func(tensor-bufferize)",
         "catalyst-bufferize", # Must be run before -- func.func(linalg-bufferize)
-        "func.func(linalg-bufferize)",
-        "func.func(tensor-bufferize)",
+        #"func.func(linalg-bufferize)",
+        #"func.func(tensor-bufferize)",
         "quantum-bufferize",
         "func-bufferize",
         "func.func(finalizing-bufferize)",

From ad22787af4380ec920571bd672e07e03cc187433 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Tue, 20 Aug 2024 11:05:11 -0400
Subject: [PATCH 010/183] Temporarily disable finalizing-bufferization for
 debugging

---
 frontend/catalyst/compiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index e8635aaa23..d6d0e9bdb6 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -207,7 +207,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         #"func.func(tensor-bufferize)",
         "quantum-bufferize",
         "func-bufferize",
-        "func.func(finalizing-bufferize)",
+        #"func.func(finalizing-bufferize)",
         "canonicalize", # Remove dead memrefToTensorOp's
                         # introduced during gradient-bufferize of callbacks
         "func.func(buffer-hoisting)",

From 0852bccb57f5106a75fbe6d84c54aab8568aef88 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Tue, 20 Aug 2024 16:41:45 -0400
Subject: [PATCH 011/183] Reformat compiler.py

---
 frontend/catalyst/compiler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 8bb86e2791..a13bd44194 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -199,16 +199,16 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "scf-bufferize",
         "convert-tensor-to-linalg",      # tensor.pad
         "convert-elementwise-to-linalg", # Must be run before --arith-bufferize
-        #"arith-bufferize",
+        # "arith-bufferize",
         "empty-tensor-to-alloc-tensor",
-        #"func.func(bufferization-bufferize)",
-        #"func.func(tensor-bufferize)",
+        # "func.func(bufferization-bufferize)",
+        # "func.func(tensor-bufferize)",
         "catalyst-bufferize", # Must be run before -- func.func(linalg-bufferize)
-        #"func.func(linalg-bufferize)",
-        #"func.func(tensor-bufferize)",
+        # "func.func(linalg-bufferize)",
+        # "func.func(tensor-bufferize)",
         "quantum-bufferize",
         "func-bufferize",
-        #"func.func(finalizing-bufferize)",
+        # "func.func(finalizing-bufferize)",
         "canonicalize", # Remove dead memrefToTensorOp's
                         # introduced during gradient-bufferize of callbacks
         "func.func(buffer-hoisting)",

From ffcd4d3eb64faef40a2690f0e6610dca66998e3a Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Tue, 20 Aug 2024 18:52:43 -0400
Subject: [PATCH 012/183] Remove all passes and use one-shot-bufferize only

---
 frontend/catalyst/compiler.py | 32 +++++---------------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index a13bd44194..7accd85ee1 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -193,33 +193,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
 BUFFERIZATION_PASS = (
     "BufferizationPass",
     [
-        "one-shot-bufferize{dialect-filter=memref}",
-        "inline",
-        "gradient-bufferize",
-        "scf-bufferize",
-        "convert-tensor-to-linalg",      # tensor.pad
-        "convert-elementwise-to-linalg", # Must be run before --arith-bufferize
-        # "arith-bufferize",
-        "empty-tensor-to-alloc-tensor",
-        # "func.func(bufferization-bufferize)",
-        # "func.func(tensor-bufferize)",
-        "catalyst-bufferize", # Must be run before -- func.func(linalg-bufferize)
-        # "func.func(linalg-bufferize)",
-        # "func.func(tensor-bufferize)",
-        "quantum-bufferize",
-        "func-bufferize",
-        # "func.func(finalizing-bufferize)",
-        "canonicalize", # Remove dead memrefToTensorOp's
-                        # introduced during gradient-bufferize of callbacks
-        "func.func(buffer-hoisting)",
-        "func.func(buffer-loop-hoisting)",
-        "func.func(buffer-deallocation)",
-        "convert-arraylist-to-memref",
-        "convert-bufferization-to-memref",
-        "canonicalize", # Must be after convert-bufferization-to-memref
-                        # otherwise there are issues in lowering of dynamic tensors.
-        # "cse",
-        "cp-global-memref",
+        "one-shot-bufferize{bufferize-function-boundaries}",
     ],
 )
 
@@ -275,6 +249,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
     HLO_LOWERING_PASS,
     QUANTUM_COMPILATION_PASS,
     BUFFERIZATION_PASS,
+    #BUFFERIZATION_PASS2,
+    #BUFFERIZATION_PASS3,
     MLIR_TO_LLVM_PASS,
 ]
 
@@ -290,6 +266,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
     HLO_LOWERING_PASS,
     QUANTUM_COMPILATION_PASS,
     BUFFERIZATION_PASS,
+    #BUFFERIZATION_PASS2,
+    #BUFFERIZATION_PASS3,
     MLIR_TO_LLVM_ASYNC_PASS,
 ]

From 8b45ae8429410e19b4069d7deb5dc2d8b5e5dcd7 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 21 Aug 2024 09:59:48 -0400
Subject: [PATCH 013/183] Add new buffer passes except for the deallocation
 pipeline

---
 frontend/catalyst/compiler.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 7accd85ee1..862e202c0e 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -193,7 +193,14 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
 BUFFERIZATION_PASS = (
     "BufferizationPass",
     [
+        "eliminate-empty-tensors",
         "one-shot-bufferize{bufferize-function-boundaries}",
+        "func.func(buffer-hoisting)",
+        "func.func(buffer-loop-hoisting)",
+        "buffer-results-to-out-params",
+        "drop-equivalent-buffer-results",
+        "func.func(promote-buffers-to-stack)",
+        #"buffer-deallocation-pipeline",
     ],
 )

From 4d20def85cab458e7ddf297d9424c1573f109e01 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 21 Aug 2024 12:05:47 -0400
Subject: [PATCH 014/183] Temporarily set side effect of InitOp and FinalizeOp
 as zero

---
 frontend/catalyst/compiler.py         | 5 ++++-
 mlir/include/Quantum/IR/QuantumOps.td | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 862e202c0e..6bfef707ae 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -195,12 +195,15 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "eliminate-empty-tensors",
         "one-shot-bufferize{bufferize-function-boundaries}",
+        #"gradient-bufferize",
+        #"catalyst-bufferize",
+        #"quantum-bufferize",
         "func.func(buffer-hoisting)",
         "func.func(buffer-loop-hoisting)",
         "buffer-results-to-out-params",
         "drop-equivalent-buffer-results",
         "func.func(promote-buffers-to-stack)",
-        #"buffer-deallocation-pipeline",
+        "buffer-deallocation-pipeline",
     ],
 )
diff --git a/mlir/include/Quantum/IR/QuantumOps.td b/mlir/include/Quantum/IR/QuantumOps.td
index dd81b8f52d..10c4cd50f6 100644
--- a/mlir/include/Quantum/IR/QuantumOps.td
+++ b/mlir/include/Quantum/IR/QuantumOps.td
@@ -63,7 +63,7 @@ def NamedObservableAttr : EnumAttr<QuantumDialect, NamedObservable, "named_obser
 class Quantum_Op<string mnemonic, list<Trait> traits = []> : Op<QuantumDialect, mnemonic, traits>;
 
-def InitializeOp : Quantum_Op<"init"> {
+def InitializeOp : Quantum_Op<"init", [NoMemoryEffect]> {
     let summary = "Initialize the quantum runtime.";
 
     let assemblyFormat = [{
@@ -71,7 +71,7 @@ def InitializeOp : Quantum_Op<"init"> {
     }];
 }
 
-def FinalizeOp : Quantum_Op<"finalize"> {
+def FinalizeOp : Quantum_Op<"finalize", [NoMemoryEffect]> {
     let summary = "Teardown the quantum runtime.";
 
     let assemblyFormat = [{

From d9b38ba39cb71cca03ea69709600c25a4b5ae6e4 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 21 Aug 2024 15:42:46 -0400
Subject: [PATCH 015/183] Disable deallocation pipeline for debugging

---
 frontend/catalyst/compiler.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 6bfef707ae..862e202c0e 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -195,15 +195,12 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "eliminate-empty-tensors",
         "one-shot-bufferize{bufferize-function-boundaries}",
-        #"gradient-bufferize",
-        #"catalyst-bufferize",
-        #"quantum-bufferize",
         "func.func(buffer-hoisting)",
         "func.func(buffer-loop-hoisting)",
         "buffer-results-to-out-params",
         "drop-equivalent-buffer-results",
         "func.func(promote-buffers-to-stack)",
-        "buffer-deallocation-pipeline",
+        #"buffer-deallocation-pipeline",
     ],
 )
From 676053f9547e616ef4bdd01ad4966f7541606bf1 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 21 Aug 2024 15:43:41 -0400
Subject: [PATCH 016/183] Draft Quantum Impl of BufferizableOpInterface

---
 .../Quantum/Transforms/BufferizableOpInterfaceImpl.h | 12 ++++++++++++
 mlir/lib/Quantum/IR/QuantumDialect.cpp               |  3 +++
 .../Transforms/BufferizableOpInterfaceImpl.cpp       | 11 +++++++++++
 mlir/lib/Quantum/Transforms/CMakeLists.txt           |  1 +
 4 files changed, 27 insertions(+)
 create mode 100644 mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
 create mode 100644 mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp

diff --git a/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
new file mode 100644
index 0000000000..069f4b3ee4
--- /dev/null
+++ b/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
@@ -0,0 +1,12 @@
+#ifndef MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
+#define MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
+
+namespace mlir {
+class DialectRegistry;
+
+namespace tensor {
+void registerBufferizableOpInterfaceExternalModels(DialectRegistry &registry);
+} // namespace tensor
+} // namespace mlir
+
+#endif // MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
\ No newline at end of file
diff --git a/mlir/lib/Quantum/IR/QuantumDialect.cpp b/mlir/lib/Quantum/IR/QuantumDialect.cpp
index 385f4e0ae5..04bfe34f2c 100644
--- a/mlir/lib/Quantum/IR/QuantumDialect.cpp
+++ b/mlir/lib/Quantum/IR/QuantumDialect.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/DialectImplementation.h" // needed for generated type parser
 #include "llvm/ADT/TypeSwitch.h"           // needed for generated type parser
 
@@ -43,6 +44,8 @@ void QuantumDialect::initialize()
 #define GET_OP_LIST
 #include "Quantum/IR/QuantumOps.cpp.inc"
         >();
+
+    declarePromisedInterfaces();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
new file mode 100644
index 0000000000..a92aa1da79
--- /dev/null
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -0,0 +1,11 @@
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+
+#include "Quantum/IR/QuantumOps.h"
+#include "Quantum/Transforms/BufferizableOpInterfaceImpl.h"
+
+using namespace mlir;
+using namespace catalyst::quantum;
+
+namespace {
+
+}
\ No newline at end of file
diff --git a/mlir/lib/Quantum/Transforms/CMakeLists.txt b/mlir/lib/Quantum/Transforms/CMakeLists.txt
index 51118a3878..22489d497b 100644
--- a/mlir/lib/Quantum/Transforms/CMakeLists.txt
+++ b/mlir/lib/Quantum/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LIBRARY_NAME quantum-transforms)
 
 file(GLOB SRC
+    BufferizableOpInterfaceImpl.cpp
     BufferizationPatterns.cpp
     quantum_bufferize.cpp
     ConversionPatterns.cpp
From 3febbf7a8e2cee3524ce076cdd90e5ce8dd30855 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 21 Aug 2024 17:53:28 -0400
Subject: [PATCH 017/183] Add empty one-shot-bufferize for quantum::ExtractOp

---
 .../Transforms/BufferizableOpInterfaceImpl.h  | 19 +++++-----
 .../BufferizableOpInterfaceImpl.cpp           | 36 +++++++++++++++++++
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
index 069f4b3ee4..bf60013f70 100644
--- a/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
+++ b/mlir/include/Quantum/Transforms/BufferizableOpInterfaceImpl.h
@@ -1,12 +1,13 @@
-#ifndef MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
-#define MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
+#pragma once
 
-namespace mlir {
-class DialectRegistry;
+using namespace mlir;
 
-namespace tensor {
-void registerBufferizableOpInterfaceExternalModels(DialectRegistry &registry);
-} // namespace tensor
-} // namespace mlir
+namespace catalyst {
 
-#endif // MLIR_DIALECT_QUANTUM_BUFFERIZABLEOPINTERFACEIMPL_H
\ No newline at end of file
+namespace quantum {
+
+void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry &registry);
+
+}
+
+} // namespace catalyst
diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index a92aa1da79..f3dbb521e2 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1,3 +1,4 @@
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 
 #include "Quantum/IR/QuantumOps.h"
@@ -8,4 +9,39 @@ using namespace catalyst::quantum;
 
 namespace {
 
+/// Bufferization of tensor.extract. Replace with memref.load.
+struct ExtractOpInterface
+    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<ExtractOpInterface,
+                                                                         catalyst::quantum::ExtractOp> {
+    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                const mlir::bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                 const mlir::bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
+                                                             mlir::OpOperand &opOperand,
+                                                             const mlir::bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
+                            const mlir::bufferization::BufferizationOptions &options) const {
+        auto extractOp = cast<ExtractOp>(op);
+
+        return success();
+    }
+};
+
+}
+
+void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
+    DialectRegistry &registry) {
+    registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
+        ExtractOp::attachInterface<ExtractOpInterface>(*ctx);
+    });
+}
\ No newline at end of file

From a354e3fe8efffbfad599d714a5990418f7bbf966 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Mon, 26 Aug 2024 13:46:19 -0400
Subject: [PATCH 018/183] Temporarily get rid of memory error

---
 frontend/catalyst/compiler.py         | 6 +++---
 mlir/include/Quantum/IR/QuantumOps.td | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 862e202c0e..6a1bf8242c 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -197,9 +197,9 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "one-shot-bufferize{bufferize-function-boundaries}",
         "func.func(buffer-hoisting)",
         "func.func(buffer-loop-hoisting)",
-        "buffer-results-to-out-params",
-        "drop-equivalent-buffer-results",
-        "func.func(promote-buffers-to-stack)",
+        #"buffer-results-to-out-params",
+        #"drop-equivalent-buffer-results",
+        #"func.func(promote-buffers-to-stack)",
         #"buffer-deallocation-pipeline",
     ],
 )
diff --git a/mlir/include/Quantum/IR/QuantumOps.td b/mlir/include/Quantum/IR/QuantumOps.td
index 10c4cd50f6..dd81b8f52d 100644
--- a/mlir/include/Quantum/IR/QuantumOps.td
+++ b/mlir/include/Quantum/IR/QuantumOps.td
@@ -63,7 +63,7 @@ def NamedObservableAttr : EnumAttr<QuantumDialect, NamedObservable, "named_obser
 class Quantum_Op<string mnemonic, list<Trait> traits = []> : Op<QuantumDialect, mnemonic, traits>;
 
-def InitializeOp : Quantum_Op<"init", [NoMemoryEffect]> {
+def InitializeOp : Quantum_Op<"init"> {
     let summary = "Initialize the quantum runtime.";
 
     let assemblyFormat = [{
@@ -71,7 +71,7 @@ def InitializeOp : Quantum_Op<"init"> {
     }];
 }
 
-def FinalizeOp : Quantum_Op<"finalize", [NoMemoryEffect]> {
+def FinalizeOp : Quantum_Op<"finalize"> {
     let summary = "Teardown the quantum runtime.";
 
     let assemblyFormat = [{
From 7de0d80668e18a13641837c283831aadc91f6514 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Mon, 26 Aug 2024 14:03:39 -0400
Subject: [PATCH 019/183] Register new one-shot bufferization pass

---
 mlir/lib/Driver/CompilerDriver.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp
index af19b09038..99a03f2347 100644
--- a/mlir/lib/Driver/CompilerDriver.cpp
+++ b/mlir/lib/Driver/CompilerDriver.cpp
@@ -64,6 +64,7 @@
 #include "Mitigation/Transforms/Passes.h"
 #include "Quantum/IR/QuantumDialect.h"
 #include "Quantum/Transforms/Passes.h"
+#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
 
 #include "Enzyme.h"
 #include "Timer.hpp"
@@ -71,6 +72,7 @@
 using namespace mlir;
 using namespace catalyst;
 using namespace catalyst::driver;
+using namespace catalyst::quantum;
 
 namespace catalyst::utils {
 
@@ -300,6 +302,9 @@ void registerAllCatalystDialects(DialectRegistry &registry)
     registry.insert();
     registry.insert();
     registry.insert();
+
+    // Extend one-shot bufferization pass.
+    catalyst::quantum::registerBufferizableOpInterfaceExternalModels(registry);
 }
 
 } // namespace

From af6f65a401d1f8948013f49c7b54ef5f6df47edc Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Mon, 26 Aug 2024 14:22:23 -0400
Subject: [PATCH 020/183] Fix include typos

---
 mlir/lib/Driver/CompilerDriver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp
index 99a03f2347..07d92c6a58 100644
--- a/mlir/lib/Driver/CompilerDriver.cpp
+++ b/mlir/lib/Driver/CompilerDriver.cpp
@@ -64,7 +64,7 @@
 #include "Mitigation/Transforms/Passes.h"
 #include "Quantum/IR/QuantumDialect.h"
 #include "Quantum/Transforms/Passes.h"
-#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
+#include "Quantum/Transforms/BufferizableOpInterfaceImpl.h"
 
 #include "Enzyme.h"
 #include "Timer.hpp"
From 1edbcd2bffb97832e17b77225e59b002e4b4b4a9 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Mon, 26 Aug 2024 14:27:31 -0400
Subject: [PATCH 021/183] Replace ExtractOP with StateOp for the first
 bufferization example

---
 .../Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index f3dbb521e2..11f3f614e9 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -10,9 +10,9 @@ using namespace catalyst::quantum;
 namespace {
 
 /// Bufferization of tensor.extract. Replace with memref.load.
-struct ExtractOpInterface
-    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<ExtractOpInterface,
-                                                                         catalyst::quantum::ExtractOp> {
+struct StateOpInterface
+    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<StateOpInterface,
+                                                                         catalyst::quantum::StateOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
         return true;
@@ -31,7 +31,7 @@ struct ExtractOpInterface
 
     LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
                             const mlir::bufferization::BufferizationOptions &options) const {
-        auto extractOp = cast<ExtractOp>(op);
+        auto StateOp = cast<StateOp>(op);
 
         return success();
     }
@@ -42,6 +42,6 @@ struct ExtractOpInterface
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
     DialectRegistry &registry) {
     registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
-        ExtractOp::attachInterface<ExtractOpInterface>(*ctx);
+        StateOp::attachInterface<StateOpInterface>(*ctx);
     });
 }
\ No newline at end of file
From f94dcf785dbc15b5247b0a5fc6a203ff55af7d81 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Tue, 27 Aug 2024 15:39:32 -0400
Subject: [PATCH 022/183] Test StateOp bufferization

---
 frontend/catalyst/compiler.py                 |  4 ++--
 .../BufferizableOpInterfaceImpl.cpp           | 19 ++++++++++++++++---
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 6a1bf8242c..85d3172db6 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -195,8 +195,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
     [
         "eliminate-empty-tensors",
         "one-shot-bufferize{bufferize-function-boundaries}",
-        "func.func(buffer-hoisting)",
-        "func.func(buffer-loop-hoisting)",
+        #"func.func(buffer-hoisting)",
+        #"func.func(buffer-loop-hoisting)",
         #"buffer-results-to-out-params",
         #"drop-equivalent-buffer-results",
         #"func.func(promote-buffers-to-stack)",
diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index 11f3f614e9..457bd05613 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1,5 +1,6 @@
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Transforms/DialectConversion.h"
 
 #include "Quantum/IR/QuantumOps.h"
 #include "Quantum/Transforms/BufferizableOpInterfaceImpl.h"
@@ -15,7 +16,7 @@ struct StateOpInterface
                                  catalyst::quantum::StateOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
-        return true;
+        return false;
     }
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
@@ -31,7 +32,19 @@ struct StateOpInterface
 
     LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
                             const mlir::bufferization::BufferizationOptions &options) const {
-        auto StateOp = cast<StateOp>(op);
+        auto stateOp = cast<StateOp>(op);
+        Location loc = op->getLoc();
+        FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
+            rewriter, loc, stateOp.getState(), options,
+            /*copy=*/false);
+        if (failed(tensorAlloc))
+            return failure();
+        llvm::outs() << "This rewrite happens!\n";
+        auto tensorType = cast<RankedTensorType>(tensorAlloc->getType());
+        MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+        //rewriter.create<memref::AllocOp>(loc, resultType);
+        Value allocVal = rewriter.replaceOpWithNewOp<memref::AllocOp>(stateOp, resultType);
+        rewriter.create<StateOp>(loc, cast<MemRefType>(allocVal.getType()));
 
         return success();
     }

From a3816a6bce42565a53dd577734d28b02366d8717 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Tue, 27 Aug 2024 19:26:29 -0400
Subject: [PATCH 023/183] Use replaceOpWithBufferizedValues to get rid of
 segmentation fault

---
 .../BufferizableOpInterfaceImpl.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index 457bd05613..b8dfdc3451 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -16,12 +16,12 @@ struct StateOpInterface
                                  catalyst::quantum::StateOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
-        return false;
+        return true;
     }
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
                                  const mlir::bufferization::AnalysisState &state) const {
-        return false;
+        return true;
     }
 
     mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
@@ -34,17 +34,12 @@ struct StateOpInterface
     LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
                             const mlir::bufferization::BufferizationOptions &options) const {
         auto stateOp = cast<StateOp>(op);
         Location loc = op->getLoc();
-        FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
-            rewriter, loc, stateOp.getState(), options,
-            /*copy=*/false);
-        if (failed(tensorAlloc))
-            return failure();
-        llvm::outs() << "This rewrite happens!\n";
-        auto tensorType = cast<RankedTensorType>(tensorAlloc->getType());
+        auto tensorType = cast<RankedTensorType>(stateOp.getState().getType());
         MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
-        //rewriter.create<memref::AllocOp>(loc, resultType);
-        Value allocVal = rewriter.replaceOpWithNewOp<memref::AllocOp>(stateOp, resultType);
-        rewriter.create<StateOp>(loc, cast<MemRefType>(allocVal.getType()));
+
+        Value allocVal = rewriter.create<memref::AllocOp>(loc, resultType);
+        rewriter.create<StateOp>(loc, TypeRange{}, ValueRange{stateOp.getObs(), allocVal});
+        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
 
         return success();
     }

From fc46399b8134e12020eecaabdc81b95194a7addb2 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 11:21:51 -0400
Subject: [PATCH 024/183] Set buffer write for StateOp as false

---
 mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index b8dfdc3451..a3fd82fab3 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -21,7 +21,7 @@ struct StateOpInterface
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
                                  const mlir::bufferization::AnalysisState &state) const {
-        return true;
+        return false;
     }
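Note on PATCH 023 above: the segfault disappears because, during one-shot bufferization, the op being rewritten may still have tensor-typed users, so replacing it directly with a memref.alloc (as PATCH 022 did) leaves type-inconsistent IR. bufferization::replaceOpWithBufferizedValues instead wraps each buffer in a bufferization.to_tensor op before substituting it, and the driver folds those away as bufferization completes. A hedged sketch of the idiom (the helper name is illustrative, not Catalyst's API):

    #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
    #include "mlir/Dialect/MemRef/IR/MemRef.h"

    using namespace mlir;

    // Replace a one-result tensor op with a freshly allocated buffer; the
    // inserted to_tensor wrapper keeps remaining tensor users type-correct.
    void replaceWithFreshBuffer(RewriterBase &rewriter, Operation *op, MemRefType type) {
        Value buffer = rewriter.create<memref::AllocOp>(op->getLoc(), type);
        bufferization::replaceOpWithBufferizedValues(rewriter, op, buffer);
    }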
From f02870939bed617fc7b33edd405222d53c982e9f Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 11:47:02 -0400
Subject: [PATCH 025/183] Add new bufferization pass for ProbsOp

---
 .../BufferizableOpInterfaceImpl.cpp | 42 ++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index a3fd82fab3..cc199c7c7d 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -10,7 +10,8 @@ using namespace catalyst::quantum;
 
 namespace {
 
-/// Bufferization of tensor.extract. Replace with memref.load.
+/// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new
+/// catalyst.quantum.state that uses the memory allocated by memref.alloc.
 struct StateOpInterface
     : public mlir::bufferization::BufferizableOpInterface::ExternalModel<StateOpInterface,
                                                                          catalyst::quantum::StateOp> {
@@ -45,11 +46,48 @@ struct StateOpInterface
     }
 };
 
+/// Bufferization of catalyst.quantum.probs. Replace with memref.alloc and a new
+/// catalyst.quantum.probs that uses the memory allocated by memref.alloc.
+struct ProbsOpInterface
+    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<ProbsOpInterface,
+                                                                         catalyst::quantum::ProbsOp> {
+    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                const mlir::bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                 const mlir::bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
+                                                             mlir::OpOperand &opOperand,
+                                                             const mlir::bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
+                            const mlir::bufferization::BufferizationOptions &options) const {
+        auto probsOp = cast<ProbsOp>(op);
+        Location loc = op->getLoc();
+        auto tensorType = cast<RankedTensorType>(probsOp.getProbabilities().getType());
+        MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+
+        Value allocVal = rewriter.create<memref::AllocOp>(loc, resultType);
+        rewriter.create<ProbsOp>(loc, TypeRange{}, ValueRange{probsOp.getObs(), allocVal});
+        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
+
+        return success();
+    }
+};
+
+} // namespace
 
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
     DialectRegistry &registry) {
     registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
         StateOp::attachInterface<StateOpInterface>(*ctx);
+        ProbsOp::attachInterface<ProbsOpInterface>(*ctx);
     });
 }
\ No newline at end of file
From dc631a7489a6e3268f93a1780baa9aeac7d2ed6c Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 12:20:51 -0400
Subject: [PATCH 026/183] Add new bufferization interface for CountsOp

---
 .../BufferizableOpInterfaceImpl.cpp | 41 +++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index cc199c7c7d..c2607e84fd 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -82,6 +82,46 @@ struct ProbsOpInterface
     }
 };
 
+/// Bufferization of catalyst.quantum.counts. Replace with memref.alloc and a new
+/// catalyst.quantum.counts that uses the memory allocated by memref.alloc.
+struct CountsOpInterface
+    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<CountsOpInterface,
+                                                                         catalyst::quantum::CountsOp> {
+    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                const mlir::bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
+                                 const mlir::bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
+                                                             mlir::OpOperand &opOperand,
+                                                             const mlir::bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
+                            const mlir::bufferization::BufferizationOptions &options) const {
+        auto countsOp = cast<CountsOp>(op);
+        Location loc = op->getLoc();
+        auto tensorType0 = cast<RankedTensorType>(countsOp.getEigvals().getType());
+        auto tensorType1 = cast<RankedTensorType>(countsOp.getCounts().getType());
+        MemRefType resultType0 = MemRefType::get(tensorType0.getShape(), tensorType0.getElementType());
+        MemRefType resultType1 = MemRefType::get(tensorType1.getShape(), tensorType1.getElementType());
+
+        Value allocVal0 = rewriter.create<memref::AllocOp>(loc, resultType0);
+        Value allocVal1 = rewriter.create<memref::AllocOp>(loc, resultType1);
+        rewriter.create<CountsOp>(loc, nullptr, nullptr, countsOp.getObs(), allocVal0, allocVal1,
+                                  countsOp.getShotsAttr());
+        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1});
+
+        return success();
+    }
+};
+
 } // namespace
 
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
@@ -89,5 +129,6 @@ void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
     registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
         StateOp::attachInterface<StateOpInterface>(*ctx);
         ProbsOp::attachInterface<ProbsOpInterface>(*ctx);
+        CountsOp::attachInterface<CountsOpInterface>(*ctx);
     });
 }
\ No newline at end of file

From b8181b8134e12020eecaabdc81b95194a7addb24 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 14:14:57 -0400
Subject: [PATCH 027/183] Correct the read flag for StateOp like operations

---
 mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index c2607e84fd..c42a973d4c 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -17,7 +17,7 @@ struct StateOpInterface
                                  catalyst::quantum::StateOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
-        return true;
+        return false;
     }
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
@@ -53,7 +53,7 @@ struct ProbsOpInterface
                                  catalyst::quantum::ProbsOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
-        return true;
+        return false;
    }
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
@@ -89,7 +89,7 @@ struct CountsOpInterface
                                  catalyst::quantum::CountsOp> {
     bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
-        return true;
+        return false;
     }
 
     bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
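Note on PATCH 027: in one-shot bufferization, bufferizesToMemoryRead/bufferizesToMemoryWrite state, per OpOperand, whether the op will read or write the buffer that operand eventually bufferizes to; the analysis uses these answers to decide where copies are required. For measurement-style ops that only consume an observable handle and fill a buffer they allocate themselves, both can be false. A commented sketch of the contract (illustrative, not the final Catalyst code):

    // The observable operand is a handle, not tensor data read from a buffer.
    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
                                const mlir::bufferization::AnalysisState &state) const {
        return false;
    }

    // The op writes only into the buffer it allocates itself, never through an
    // operand, so no conflict-driven copies are needed.
    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
                                 const mlir::bufferization::AnalysisState &state) const {
        return false;
    }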
From 849e884bd363f37029e345dcf246d77db3e166d1 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 15:52:35 -0400
Subject: [PATCH 028/183] Add new set_state bufferization

---
 .../BufferizableOpInterfaceImpl.cpp | 110 ++++++++++++------
 1 file changed, 75 insertions(+), 35 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index c42a973d4c..90dc64e523 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1,4 +1,5 @@
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Transforms/DialectConversion.h"
 
@@ -13,26 +14,26 @@ namespace {
 /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new
 /// catalyst.quantum.state that uses the memory allocated by memref.alloc.
 struct StateOpInterface
-    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<StateOpInterface,
-                                                                         catalyst::quantum::StateOp> {
-    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                const mlir::bufferization::AnalysisState &state) const {
+    : public bufferization::BufferizableOpInterface::ExternalModel<StateOpInterface,
+                                                                   catalyst::quantum::StateOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
        return false;
     }
 
-    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                 const mlir::bufferization::AnalysisState &state) const {
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
         return false;
     }
 
-    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
-                                                             mlir::OpOperand &opOperand,
-                                                             const mlir::bufferization::AnalysisState &state) const {
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
         return {};
     }
 
-    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
-                            const mlir::bufferization::BufferizationOptions &options) const {
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
         auto stateOp = cast<StateOp>(op);
         Location loc = op->getLoc();
         auto tensorType = cast<RankedTensorType>(stateOp.getState().getType());
@@ -41,7 +41,7 @@ struct StateOpInterface
 
         Value allocVal = rewriter.create<memref::AllocOp>(loc, resultType);
         rewriter.create<StateOp>(loc, TypeRange{}, ValueRange{stateOp.getObs(), allocVal});
-        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
 
         return success();
     }
@@ -50,26 +50,26 @@ struct StateOpInterface
 /// Bufferization of catalyst.quantum.probs. Replace with memref.alloc and a new
 /// catalyst.quantum.probs that uses the memory allocated by memref.alloc.
 struct ProbsOpInterface
-    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<ProbsOpInterface,
-                                                                         catalyst::quantum::ProbsOp> {
-    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                const mlir::bufferization::AnalysisState &state) const {
+    : public bufferization::BufferizableOpInterface::ExternalModel<ProbsOpInterface,
+                                                                   catalyst::quantum::ProbsOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
         return false;
     }
 
-    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                 const mlir::bufferization::AnalysisState &state) const {
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
         return false;
     }
 
-    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
-                                                             mlir::OpOperand &opOperand,
-                                                             const mlir::bufferization::AnalysisState &state) const {
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
         return {};
     }
 
-    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
-                            const mlir::bufferization::BufferizationOptions &options) const {
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
         auto probsOp = cast<ProbsOp>(op);
         Location loc = op->getLoc();
         auto tensorType = cast<RankedTensorType>(probsOp.getProbabilities().getType());
@@ -77,35 +77,35 @@ struct ProbsOpInterface
 
         Value allocVal = rewriter.create<memref::AllocOp>(loc, resultType);
         rewriter.create<ProbsOp>(loc, TypeRange{}, ValueRange{probsOp.getObs(), allocVal});
-        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal);
 
         return success();
     }
 };
 
-/// Bufferization of catalyst.quantum.counts. Replace with memref.alloc and a new
-/// catalyst.quantum.counts that uses the memory allocated by memref.alloc.
+/// Bufferization of catalyst.quantum.counts. Replace with memref.allocs and a new
+/// catalyst.quantum.counts that uses the memory allocated by memref.allocs.
 struct CountsOpInterface
-    : public mlir::bufferization::BufferizableOpInterface::ExternalModel<CountsOpInterface,
-                                                                         catalyst::quantum::CountsOp> {
-    bool bufferizesToMemoryRead(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                const mlir::bufferization::AnalysisState &state) const {
+    : public bufferization::BufferizableOpInterface::ExternalModel<CountsOpInterface,
+                                                                   catalyst::quantum::CountsOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
         return false;
     }
 
-    bool bufferizesToMemoryWrite(mlir::Operation *op, mlir::OpOperand &opOperand,
-                                 const mlir::bufferization::AnalysisState &state) const {
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
         return false;
     }
 
-    mlir::bufferization::AliasingValueList getAliasingValues(mlir::Operation *op,
-                                                             mlir::OpOperand &opOperand,
-                                                             const mlir::bufferization::AnalysisState &state) const {
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
         return {};
     }
 
-    LogicalResult bufferize(mlir::Operation *op, RewriterBase &rewriter,
-                            const mlir::bufferization::BufferizationOptions &options) const {
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
         auto countsOp = cast<CountsOp>(op);
         Location loc = op->getLoc();
         auto tensorType0 = cast<RankedTensorType>(countsOp.getEigvals().getType());
         auto tensorType1 = cast<RankedTensorType>(countsOp.getCounts().getType());
         MemRefType resultType0 = MemRefType::get(tensorType0.getShape(), tensorType0.getElementType());
         MemRefType resultType1 = MemRefType::get(tensorType1.getShape(), tensorType1.getElementType());
 
         Value allocVal0 = rewriter.create<memref::AllocOp>(loc, resultType0);
         Value allocVal1 = rewriter.create<memref::AllocOp>(loc, resultType1);
         rewriter.create<CountsOp>(loc, nullptr, nullptr, countsOp.getObs(), allocVal0, allocVal1,
                                   countsOp.getShotsAttr());
-        mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1});
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1});
 
         return success();
     }
 };
 
+/// Bufferization of catalyst.quantum.set_state. Replace with bufferization::ToMemrefOp and
+/// a new catalyst.quantum.set_state that uses the memref from bufferization::ToMemrefOp.
+struct SetStateOpInterface
+    : public bufferization::BufferizableOpInterface::ExternalModel<SetStateOpInterface,
+                                                                   catalyst::quantum::SetStateOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
+        auto setStateOp = cast<SetStateOp>(op);
+        Location loc = op->getLoc();
+        auto tensorType = cast<RankedTensorType>(setStateOp.getInState().getType());
+        MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+
+        auto toMemrefOp = rewriter.create<bufferization::ToMemrefOp>(loc, memrefType,
+                                                                     setStateOp.getInState());
+        auto memref = toMemrefOp.getResult();
+        auto newSetStateOp = rewriter.create<SetStateOp>(loc, setStateOp.getOutQubits().getTypes(),
+                                                         memref, setStateOp.getInQubits());
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits());
+        return success();
+    }
+};
+
 } // namespace
 
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
     DialectRegistry &registry) {
     registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
         StateOp::attachInterface<StateOpInterface>(*ctx);
         ProbsOp::attachInterface<ProbsOpInterface>(*ctx);
         CountsOp::attachInterface<CountsOpInterface>(*ctx);
+        SetStateOp::attachInterface<SetStateOpInterface>(*ctx);
     });
 }
\ No newline at end of file
From 39f175714a49f6931654279fa3005b98c22f8fd6 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 16:23:54 -0400
Subject: [PATCH 029/183] Add SetBasisStateOp bufferization

---
 .../BufferizableOpInterfaceImpl.cpp | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index 90dc64e523..ccfbdc1c9d 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -161,6 +161,44 @@ struct SetStateOpInterface
     }
 };
 
+/// Bufferization of catalyst.quantum.set_basic_state. Replace with bufferization::ToMemrefOp and
+/// a new catalyst.quantum.set_basic_state that uses the memref from bufferization::ToMemrefOp.
+struct SetBasisStateOpInterface
+    : public bufferization::BufferizableOpInterface::ExternalModel<SetBasisStateOpInterface,
+                                                                   catalyst::quantum::SetBasisStateOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
+        auto setBasisStateOp = cast<SetBasisStateOp>(op);
+        Location loc = op->getLoc();
+        auto tensorType = cast<RankedTensorType>(setBasisStateOp.getBasisState().getType());
+        MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+
+        auto toMemrefOp = rewriter.create<bufferization::ToMemrefOp>(loc, memrefType,
+                                                                     setBasisStateOp.getBasisState());
+        auto memref = toMemrefOp.getResult();
+        auto newSetStateOp = rewriter.create<SetBasisStateOp>(loc, setBasisStateOp.getOutQubits().getTypes(),
+                                                              memref, setBasisStateOp.getInQubits());
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits());
+        return success();
+    }
+};
+
 } // namespace
 
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
@@ -170,5 +208,6 @@ void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
         ProbsOp::attachInterface<ProbsOpInterface>(*ctx);
         CountsOp::attachInterface<CountsOpInterface>(*ctx);
         SetStateOp::attachInterface<SetStateOpInterface>(*ctx);
+        SetBasisStateOp::attachInterface<SetBasisStateOpInterface>(*ctx);
     });
 }
\ No newline at end of file
From b5157fff5846702b08d9ddd458d9c8278acc5d29 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 28 Aug 2024 17:17:07 -0400
Subject: [PATCH 030/183] Add QubitUnitary Bufferization

---
 .../BufferizableOpInterfaceImpl.cpp | 47 +++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index ccfbdc1c9d..9b9885d897 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -11,6 +11,46 @@ using namespace catalyst::quantum;
 
 namespace {
 
+/// Bufferization of catalyst.quantum.state. Convert Matrix into memref.
+struct QubitUnitaryOpInterface
+    : public bufferization::BufferizableOpInterface::ExternalModel<QubitUnitaryOpInterface,
+                                                                   catalyst::quantum::QubitUnitaryOp> {
+    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                                const bufferization::AnalysisState &state) const {
+        return true;
+    }
+
+    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                                 const bufferization::AnalysisState &state) const {
+        return false;
+    }
+
+    bufferization::AliasingValueList getAliasingValues(Operation *op,
+                                                       OpOperand &opOperand,
+                                                       const bufferization::AnalysisState &state) const {
+        return {};
+    }
+
+    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                            const bufferization::BufferizationOptions &options) const {
+        auto qubitUnitaryOp = cast<QubitUnitaryOp>(op);
+        Location loc = op->getLoc();
+        auto tensorType = cast<RankedTensorType>(qubitUnitaryOp.getMatrix().getType());
+        MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+        auto toMemrefOp = rewriter.create<bufferization::ToMemrefOp>(loc, memrefType,
+                                                                     qubitUnitaryOp.getMatrix());
+        auto memref = toMemrefOp.getResult();
+        auto newQubitUnitaryOp = rewriter.create<QubitUnitaryOp>(
+            loc, qubitUnitaryOp.getOutQubits().getTypes(),
+            qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref,
+            qubitUnitaryOp.getInQubits(), qubitUnitaryOp.getAdjointAttr(),
+            qubitUnitaryOp.getInCtrlQubits(), qubitUnitaryOp.getInCtrlValues());
+        bufferization::replaceOpWithBufferizedValues(rewriter, op, newQubitUnitaryOp.getOutQubits());
+
+        return success();
+    }
+};
+
 /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new
 /// catalyst.quantum.state that uses the memory allocated by memref.alloc.
 struct StateOpInterface
@@ -163,8 +203,7 @@ struct CountsOpInterface
     }
 };
 
-/// Bufferization of catalyst.quantum.set_state. Replace with bufferization::ToMemrefOp and
-/// a new catalyst.quantum.set_state that uses the memref from bufferization::ToMemrefOp.
+/// Bufferization of catalyst.quantum.set_state. Convert InState into memref.
 struct SetStateOpInterface
     : public bufferization::BufferizableOpInterface::ExternalModel<SetStateOpInterface,
                                                                    catalyst::quantum::SetStateOp> {
@@ -201,8 +240,7 @@ struct SetStateOpInterface
     }
 };
 
-/// Bufferization of catalyst.quantum.set_basic_state. Replace with bufferization::ToMemrefOp and
-/// a new catalyst.quantum.set_basic_state that uses the memref from bufferization::ToMemrefOp.
+/// Bufferization of catalyst.quantum.set_basic_state. Convert BasisState into memref.
 struct SetBasisStateOpInterface
     : public bufferization::BufferizableOpInterface::ExternalModel<SetBasisStateOpInterface,
                                                                    catalyst::quantum::SetBasisStateOp> {
@@ -244,6 +282,7 @@ struct SetBasisStateOpInterface
 void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(
     DialectRegistry &registry) {
     registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) {
+        QubitUnitaryOp::attachInterface<QubitUnitaryOpInterface>(*ctx);
         StateOp::attachInterface<StateOpInterface>(*ctx);
         ProbsOp::attachInterface<ProbsOpInterface>(*ctx);
         CountsOp::attachInterface<CountsOpInterface>(*ctx);
Convert Matrix into memref. +/// Bufferization of catalyst.quantum.unitary. Convert Matrix into memref. struct QubitUnitaryOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { @@ -51,6 +51,43 @@ struct QubitUnitaryOpInterface } }; +/// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref. +struct HermitianOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto hermitianOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(hermitianOp.getMatrix().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto toMemrefOp = rewriter.create(loc, memrefType, + hermitianOp.getMatrix()); + auto memref = toMemrefOp.getResult(); + auto newHermitianOp = rewriter.create(loc, hermitianOp.getType(), memref, + hermitianOp.getQubits()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newHermitianOp.getObs()); + + return success(); + } +}; + /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new /// catalyst.quantum.state that uses the memory allocated by memref.alloc. struct StateOpInterface @@ -243,6 +280,7 @@ void catalyst::quantum::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) { QubitUnitaryOp::attachInterface(*ctx); + HermitianOp::attachInterface(*ctx); StateOp::attachInterface(*ctx); ProbsOp::attachInterface(*ctx); CountsOp::attachInterface(*ctx); From c3bf61abb15810edb208d784caa9edeb5bea123a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 28 Aug 2024 17:44:58 -0400 Subject: [PATCH 032/183] Add Hamiltonian bufferization --- .../BufferizableOpInterfaceImpl.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 50e0ff6497..a43544357a 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -88,6 +88,43 @@ struct HermitianOpInterface } }; +/// Bufferization of catalyst.quantum.hamiltonian. Convert Matrix into memref. 
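+/// (More precisely, it is the Coeffs tensor that gets bufferized.) A sketch
+/// with made-up operands and types:
+///
+///   %obs = quantum.hamiltonian(%coeffs : tensor<2xf64>) %o1, %o2 : !quantum.obs
+///
+/// becomes
+///
+///   %m = bufferization.to_memref %coeffs : memref<2xf64>
+///   %obs = quantum.hamiltonian(%m : memref<2xf64>) %o1, %o2 : !quantum.obs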
+struct HamiltonianOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto hamiltonianOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(hamiltonianOp.getCoeffs().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto toMemrefOp = rewriter.create(loc, memrefType, + hamiltonianOp.getCoeffs()); + auto memref = toMemrefOp.getResult(); + auto newHamiltonianOp = rewriter.create(loc, hamiltonianOp.getType(), memref, + hamiltonianOp.getTerms()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newHamiltonianOp.getObs()); + + return success(); + } +}; + /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new /// catalyst.quantum.state that uses the memory allocated by memref.alloc. struct StateOpInterface @@ -281,6 +318,7 @@ void catalyst::quantum::registerBufferizableOpInterfaceExternalModels( registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) { QubitUnitaryOp::attachInterface(*ctx); HermitianOp::attachInterface(*ctx); + HamiltonianOp::attachInterface(*ctx); StateOp::attachInterface(*ctx); ProbsOp::attachInterface(*ctx); CountsOp::attachInterface(*ctx); From a9ed44b82baa0c891c518b9107328699cd8513a8 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 28 Aug 2024 17:51:26 -0400 Subject: [PATCH 033/183] Remove redundant scope resolutions --- .../Transforms/BufferizableOpInterfaceImpl.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index a43544357a..76e073b9b2 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -14,7 +14,7 @@ namespace { /// Bufferization of catalyst.quantum.unitary. Convert Matrix into memref. struct QubitUnitaryOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + QubitUnitaryOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return true; @@ -54,7 +54,7 @@ struct QubitUnitaryOpInterface /// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref. struct HermitianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + HermitianOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return true; @@ -91,7 +91,7 @@ struct HermitianOpInterface /// Bufferization of catalyst.quantum.hamiltonian. Convert Matrix into memref. 
struct HamiltonianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + HamiltonianOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return true; @@ -129,7 +129,7 @@ struct HamiltonianOpInterface /// catalyst.quantum.state that uses the memory allocated by memref.alloc. struct StateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + StateOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return false; @@ -165,7 +165,7 @@ struct StateOpInterface /// catalyst.quantum.probs that uses the memory allocated by memref.alloc. struct ProbsOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + ProbsOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return false; @@ -201,7 +201,7 @@ struct ProbsOpInterface /// catalyst.quantum.counts that uses the memory allocated by memref.allocs. struct CountsOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + CountsOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return false; @@ -240,7 +240,7 @@ struct CountsOpInterface /// Bufferization of catalyst.quantum.set_state. Convert InState into memref. struct SetStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + SetStateOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return false; @@ -277,7 +277,7 @@ struct SetStateOpInterface /// Bufferization of catalyst.quantum.set_basic_state. Convert BasisState into memref. struct SetBasisStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + SetBasisStateOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return false; @@ -315,7 +315,7 @@ struct SetBasisStateOpInterface void catalyst::quantum::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, catalyst::quantum::QuantumDialect *dialect) { + registry.addExtension(+[](MLIRContext *ctx, QuantumDialect *dialect) { QubitUnitaryOp::attachInterface(*ctx); HermitianOp::attachInterface(*ctx); HamiltonianOp::attachInterface(*ctx); From b54c10437592a41279179bfff1caa02c082c1024 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 28 Aug 2024 18:01:00 -0400 Subject: [PATCH 034/183] Add SampleOp bufferization --- .../BufferizableOpInterfaceImpl.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 76e073b9b2..3e7b081b01 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -125,6 +125,43 @@ struct HamiltonianOpInterface } }; +/// Bufferization of catalyst.quantum.sample. Replace with memref.alloc and a new +/// catalyst.quantum.sample that uses the memory allocated by memref.alloc. 
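+/// Sketch (shapes illustrative; the exact printed form may differ): a
+/// value-semantics measurement such as
+///
+///   %samples = quantum.sample %obs : tensor<1000x2xf64>
+///
+/// becomes destination-passing style over a fresh allocation:
+///
+///   %alloc = memref.alloc() : memref<1000x2xf64>
+///   quantum.sample %obs in(%alloc : memref<1000x2xf64>)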
+struct SampleOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto sampleOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(sampleOp.getSamples().getType()); + MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + Value allocVal = rewriter.create(loc, resultType); + rewriter.create(loc, TypeRange{}, ValueRange{sampleOp.getObs(), allocVal}, + sampleOp->getAttrs()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); + + return success(); + } +}; + /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new /// catalyst.quantum.state that uses the memory allocated by memref.alloc. struct StateOpInterface @@ -319,6 +356,7 @@ void catalyst::quantum::registerBufferizableOpInterfaceExternalModels( QubitUnitaryOp::attachInterface(*ctx); HermitianOp::attachInterface(*ctx); HamiltonianOp::attachInterface(*ctx); + SampleOp::attachInterface(*ctx); StateOp::attachInterface(*ctx); ProbsOp::attachInterface(*ctx); CountsOp::attachInterface(*ctx); From 440b926f81c22ba56a92786b8e757670b0a94f70 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 10:35:57 -0400 Subject: [PATCH 035/183] Draft bufferization templates for Catalyst and Gradient IRs --- .../Transforms/BufferizableOpInterfaceImpl.h | 11 ++++++++++ .../Transforms/BufferizableOpInterfaceImpl.h | 13 ++++++++++++ .../BufferizableOpInterfaceImpl.cpp | 21 +++++++++++++++++++ mlir/lib/Catalyst/Transforms/CMakeLists.txt | 1 + mlir/lib/Driver/CompilerDriver.cpp | 6 +++++- .../BufferizableOpInterfaceImpl.cpp | 21 +++++++++++++++++++ mlir/lib/Gradient/Transforms/CMakeLists.txt | 1 + 7 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h create mode 100644 mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h create mode 100644 mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp create mode 100644 mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp diff --git a/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h new file mode 100644 index 0000000000..9c0e982cb0 --- /dev/null +++ b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h @@ -0,0 +1,11 @@ +#pragma once + +using namespace mlir; + +namespace catalyst { + + +void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry ®istry); + + +} // namespace catalyst \ No newline at end of file diff --git a/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h new file mode 100644 index 0000000000..c8426be8b3 --- /dev/null +++ b/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h @@ -0,0 +1,13 @@ +#pragma once + +using namespace mlir; + +namespace catalyst { + +namespace gradient { + +void 
registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry ®istry); + +} + +} // namespace catalyst \ No newline at end of file diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp new file mode 100644 index 0000000000..a1ee1289ba --- /dev/null +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -0,0 +1,21 @@ +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "Catalyst/IR/CatalystOps.h" +#include "Catalyst/Transforms/BufferizableOpInterfaceImpl.h" + +using namespace mlir; +using namespace catalyst; + +namespace { + +} // namespace + +void catalyst::registerBufferizableOpInterfaceExternalModels( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { + + }); +} \ No newline at end of file diff --git a/mlir/lib/Catalyst/Transforms/CMakeLists.txt b/mlir/lib/Catalyst/Transforms/CMakeLists.txt index 71e8fc968c..e7113aa778 100644 --- a/mlir/lib/Catalyst/Transforms/CMakeLists.txt +++ b/mlir/lib/Catalyst/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ file(GLOB SRC qnode_to_async_lowering.cpp QnodeToAsyncPatterns.cpp RegisterAllPasses.cpp + BufferizableOpInterfaceImpl.cpp BufferizationPatterns.cpp catalyst_bufferize.cpp catalyst_to_llvm.cpp diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp index 07d92c6a58..67533ad2ef 100644 --- a/mlir/lib/Driver/CompilerDriver.cpp +++ b/mlir/lib/Driver/CompilerDriver.cpp @@ -53,18 +53,20 @@ #include "llvm/Transforms/IPO/GlobalDCE.h" #include "Catalyst/IR/CatalystDialect.h" +#include "Catalyst/Transforms/BufferizableOpInterfaceImpl.h" #include "Catalyst/Transforms/Passes.h" #include "Driver/CatalystLLVMTarget.h" #include "Driver/CompilerDriver.h" #include "Driver/Support.h" #include "Gradient/IR/GradientDialect.h" #include "Gradient/IR/GradientInterfaces.h" +#include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" #include "Gradient/Transforms/Passes.h" #include "Mitigation/IR/MitigationDialect.h" #include "Mitigation/Transforms/Passes.h" #include "Quantum/IR/QuantumDialect.h" -#include "Quantum/Transforms/Passes.h" #include "Quantum/Transforms/BufferizableOpInterfaceImpl.h" +#include "Quantum/Transforms/Passes.h" #include "Enzyme.h" #include "Timer.hpp" @@ -304,7 +306,9 @@ void registerAllCatalystDialects(DialectRegistry ®istry) registry.insert(); // Extend one-shot bufferization pass. 
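+    // External models attach the BufferizableOpInterface implementations to
+    // ops from outside the dialect definitions; registering them here is what
+    // makes the upstream one-shot bufferization pass aware of our custom ops.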
+ catalyst::registerBufferizableOpInterfaceExternalModels(registry); catalyst::quantum::registerBufferizableOpInterfaceExternalModels(registry); + catalyst::gradient::registerBufferizableOpInterfaceExternalModels(registry); } } // namespace diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp new file mode 100644 index 0000000000..dee62249f4 --- /dev/null +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -0,0 +1,21 @@ +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "Gradient/IR/GradientOps.h" +#include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" + +using namespace mlir; +using namespace catalyst::gradient; + +namespace { + +} // namespace + +void catalyst::gradient::registerBufferizableOpInterfaceExternalModels( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { + + }); +} \ No newline at end of file diff --git a/mlir/lib/Gradient/Transforms/CMakeLists.txt b/mlir/lib/Gradient/Transforms/CMakeLists.txt index 7c49addbec..82c00aa58a 100644 --- a/mlir/lib/Gradient/Transforms/CMakeLists.txt +++ b/mlir/lib/Gradient/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ set(LIBRARY_NAME gradient-transforms) file(GLOB SRC GradMethods/*.cpp + BufferizableOpInterfaceImpl.cpp BufferizationPatterns.cpp gradient_bufferize.cpp LoweringPatterns.cpp From 159057c863afc1bbd79752a3600ae605539ee0f7 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 11:35:15 -0400 Subject: [PATCH 036/183] Add printOp Bufferization --- .../BufferizableOpInterfaceImpl.cpp | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index a1ee1289ba..3302b1ea92 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -11,11 +11,45 @@ using namespace catalyst; namespace { +/// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref. 
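+/// (The model below is for catalyst.print: when the op carries a value
+/// operand, e.g. "catalyst.print %v" with %v a tensor, the op is recreated
+/// over the operand's buffer instead; prints without a value operand are
+/// left untouched.)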
+struct PrintOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto printOp = cast(op); + if (printOp.getVal()) { + FailureOr source = getBuffer(rewriter, printOp.getVal(), options); + if (failed(source)) + return failure(); + bufferization::replaceOpWithNewBufferizedOp(rewriter, op, *source, + printOp.getConstValAttr(), printOp.getPrintDescriptorAttr()); + } + return success(); + } +}; + } // namespace void catalyst::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { - + PrintOp::attachInterface(*ctx); }); } \ No newline at end of file From 81394aef9cdf5db3cedb2f4093f84da6c1973074 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 11:59:05 -0400 Subject: [PATCH 037/183] Fix indentation --- .../BufferizableOpInterfaceImpl.cpp | 572 +++++++++--------- 1 file changed, 286 insertions(+), 286 deletions(-) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 3e7b081b01..8e09a014eb 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -15,114 +15,114 @@ namespace { struct QubitUnitaryOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return true; - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } - bufferization::AliasingValueList getAliasingValues(Operation *op, + bufferization::AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto qubitUnitaryOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(qubitUnitaryOp.getMatrix().getType()); - MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - qubitUnitaryOp.getMatrix()); - auto memref = toMemrefOp.getResult(); - auto newQubitUnitaryOp = rewriter.create( - loc, qubitUnitaryOp.getOutQubits().getTypes(), - qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, - qubitUnitaryOp.getInQubits(), qubitUnitaryOp.getAdjointAttr(), - qubitUnitaryOp.getInCtrlQubits(), qubitUnitaryOp.getInCtrlValues()); - 
bufferization::replaceOpWithBufferizedValues(rewriter, op, newQubitUnitaryOp.getOutQubits()); - - return success(); - } + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto qubitUnitaryOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(qubitUnitaryOp.getMatrix().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto toMemrefOp = rewriter.create(loc, memrefType, + qubitUnitaryOp.getMatrix()); + auto memref = toMemrefOp.getResult(); + auto newQubitUnitaryOp = rewriter.create( + loc, qubitUnitaryOp.getOutQubits().getTypes(), + qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, + qubitUnitaryOp.getInQubits(), qubitUnitaryOp.getAdjointAttr(), + qubitUnitaryOp.getInCtrlQubits(), qubitUnitaryOp.getInCtrlValues()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newQubitUnitaryOp.getOutQubits()); + + return success(); + } }; /// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref. struct HermitianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return true; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto hermitianOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(hermitianOp.getMatrix().getType()); - MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - hermitianOp.getMatrix()); - auto memref = toMemrefOp.getResult(); - auto newHermitianOp = rewriter.create(loc, hermitianOp.getType(), memref, - hermitianOp.getQubits()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, newHermitianOp.getObs()); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto hermitianOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(hermitianOp.getMatrix().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto toMemrefOp = rewriter.create(loc, memrefType, + hermitianOp.getMatrix()); + auto memref = toMemrefOp.getResult(); + auto newHermitianOp = rewriter.create(loc, hermitianOp.getType(), memref, + hermitianOp.getQubits()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newHermitianOp.getObs()); + + return success(); + } }; /// Bufferization of catalyst.quantum.hamiltonian. Convert Matrix into memref. 
struct HamiltonianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return true; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto hamiltonianOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(hamiltonianOp.getCoeffs().getType()); - MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - hamiltonianOp.getCoeffs()); - auto memref = toMemrefOp.getResult(); - auto newHamiltonianOp = rewriter.create(loc, hamiltonianOp.getType(), memref, - hamiltonianOp.getTerms()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, newHamiltonianOp.getObs()); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto hamiltonianOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(hamiltonianOp.getCoeffs().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto toMemrefOp = rewriter.create(loc, memrefType, + hamiltonianOp.getCoeffs()); + auto memref = toMemrefOp.getResult(); + auto newHamiltonianOp = rewriter.create(loc, hamiltonianOp.getType(), memref, + hamiltonianOp.getTerms()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newHamiltonianOp.getObs()); + + return success(); + } }; /// Bufferization of catalyst.quantum.sample. 
Replace with memref.alloc and a new @@ -130,36 +130,36 @@ struct HamiltonianOpInterface struct SampleOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto sampleOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(sampleOp.getSamples().getType()); - MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - - Value allocVal = rewriter.create(loc, resultType); - rewriter.create(loc, TypeRange{}, ValueRange{sampleOp.getObs(), allocVal}, - sampleOp->getAttrs()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto sampleOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(sampleOp.getSamples().getType()); + MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + Value allocVal = rewriter.create(loc, resultType); + rewriter.create(loc, TypeRange{}, ValueRange{sampleOp.getObs(), allocVal}, + sampleOp->getAttrs()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); + + return success(); + } }; /// Bufferization of catalyst.quantum.state. 
Replace with memref.alloc and a new @@ -167,35 +167,35 @@ struct SampleOpInterface struct StateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto stateOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(stateOp.getState().getType()); - MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - - Value allocVal = rewriter.create(loc, resultType); - rewriter.create(loc, TypeRange{}, ValueRange{stateOp.getObs(), allocVal}); - bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto stateOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(stateOp.getState().getType()); + MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + Value allocVal = rewriter.create(loc, resultType); + rewriter.create(loc, TypeRange{}, ValueRange{stateOp.getObs(), allocVal}); + bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); + + return success(); + } }; /// Bufferization of catalyst.quantum.probs. 
Replace with memref.alloc and a new @@ -203,35 +203,35 @@ struct StateOpInterface struct ProbsOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto probsOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(probsOp.getProbabilities().getType()); - MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - - Value allocVal = rewriter.create(loc, resultType); - rewriter.create(loc, TypeRange{}, ValueRange{probsOp.getObs(), allocVal}); - bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto probsOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(probsOp.getProbabilities().getType()); + MemRefType resultType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + Value allocVal = rewriter.create(loc, resultType); + rewriter.create(loc, TypeRange{}, ValueRange{probsOp.getObs(), allocVal}); + bufferization::replaceOpWithBufferizedValues(rewriter, op, allocVal); + + return success(); + } }; /// Bufferization of catalyst.quantum.counts. 
Replace with memref.allocs and a new @@ -239,128 +239,128 @@ struct ProbsOpInterface struct CountsOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto countsOp = cast(op); - Location loc = op->getLoc(); - auto tensorType0 = cast(countsOp.getEigvals().getType()); - auto tensorType1 = cast(countsOp.getCounts().getType()); - MemRefType resultType0 = MemRefType::get(tensorType0.getShape(), tensorType0.getElementType()); - MemRefType resultType1 = MemRefType::get(tensorType1.getShape(), tensorType1.getElementType()); - - Value allocVal0 = rewriter.create(loc, resultType0); - Value allocVal1 = rewriter.create(loc, resultType1); - rewriter.create(loc, nullptr, nullptr, countsOp.getObs(), allocVal0, allocVal1, - countsOp.getShotsAttr()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1}); - - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto countsOp = cast(op); + Location loc = op->getLoc(); + auto tensorType0 = cast(countsOp.getEigvals().getType()); + auto tensorType1 = cast(countsOp.getCounts().getType()); + MemRefType resultType0 = MemRefType::get(tensorType0.getShape(), tensorType0.getElementType()); + MemRefType resultType1 = MemRefType::get(tensorType1.getShape(), tensorType1.getElementType()); + + Value allocVal0 = rewriter.create(loc, resultType0); + Value allocVal1 = rewriter.create(loc, resultType1); + rewriter.create(loc, nullptr, nullptr, countsOp.getObs(), allocVal0, allocVal1, + countsOp.getShotsAttr()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1}); + + return success(); + } }; /// Bufferization of catalyst.quantum.set_state. Convert InState into memref. 
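 /// (Sketch: the InState tensor operand is replaced by a matching
 /// bufferization.to_memref result, e.g. a tensor<4xcomplex<f64>> state becomes
 /// a memref<4xcomplex<f64>>, and the op is recreated over that memref; the
 /// qubit operands and results are untouched.)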
struct SetStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return true; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto setStateOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(setStateOp.getInState().getType()); - MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - - auto toMemrefOp = rewriter.create(loc, memrefType, - setStateOp.getInState()); - auto memref = toMemrefOp.getResult(); - auto newSetStateOp = rewriter.create(loc, setStateOp.getOutQubits().getTypes(), - memref, setStateOp.getInQubits()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits()); - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto setStateOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(setStateOp.getInState().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + auto toMemrefOp = rewriter.create(loc, memrefType, + setStateOp.getInState()); + auto memref = toMemrefOp.getResult(); + auto newSetStateOp = rewriter.create(loc, setStateOp.getOutQubits().getTypes(), + memref, setStateOp.getInQubits()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits()); + return success(); + } }; /// Bufferization of catalyst.quantum.set_basic_state. Convert BasisState into memref. 
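 /// (Sketch: same pattern as set_state above, but applied to the BasisState
 /// operand, e.g. an i1 basis mask of type tensor<1xi1> becomes a memref<1xi1>.)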
struct SetBasisStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return false; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return true; - } - - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { - return {}; - } - - LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { - auto setBasisStateOp = cast(op); - Location loc = op->getLoc(); - auto tensorType = cast(setBasisStateOp.getBasisState().getType()); - MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - - auto toMemrefOp = rewriter.create(loc, memrefType, - setBasisStateOp.getBasisState()); - auto memref = toMemrefOp.getResult(); - auto newSetStateOp = rewriter.create(loc, setBasisStateOp.getOutQubits().getTypes(), - memref, setBasisStateOp.getInQubits()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits()); - return success(); - } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto setBasisStateOp = cast(op); + Location loc = op->getLoc(); + auto tensorType = cast(setBasisStateOp.getBasisState().getType()); + MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + + auto toMemrefOp = rewriter.create(loc, memrefType, + setBasisStateOp.getBasisState()); + auto memref = toMemrefOp.getResult(); + auto newSetStateOp = rewriter.create(loc, setBasisStateOp.getOutQubits().getTypes(), + memref, setBasisStateOp.getInQubits()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits()); + return success(); + } }; } // namespace void catalyst::quantum::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, QuantumDialect *dialect) { - QubitUnitaryOp::attachInterface(*ctx); - HermitianOp::attachInterface(*ctx); - HamiltonianOp::attachInterface(*ctx); - SampleOp::attachInterface(*ctx); - StateOp::attachInterface(*ctx); - ProbsOp::attachInterface(*ctx); - CountsOp::attachInterface(*ctx); - SetStateOp::attachInterface(*ctx); - SetBasisStateOp::attachInterface(*ctx); - }); + registry.addExtension(+[](MLIRContext *ctx, QuantumDialect *dialect) { + QubitUnitaryOp::attachInterface(*ctx); + HermitianOp::attachInterface(*ctx); + HamiltonianOp::attachInterface(*ctx); + SampleOp::attachInterface(*ctx); + StateOp::attachInterface(*ctx); + ProbsOp::attachInterface(*ctx); + CountsOp::attachInterface(*ctx); + SetStateOp::attachInterface(*ctx); + SetBasisStateOp::attachInterface(*ctx); + }); } \ No newline at end of file From e6df804bf5d4fe5b7925a16aeaf403aba9f8c5a4 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 
12:03:41 -0400
Subject: [PATCH 038/183] Remove redundant include from QuantumDialect

---
 mlir/lib/Quantum/IR/QuantumDialect.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mlir/lib/Quantum/IR/QuantumDialect.cpp b/mlir/lib/Quantum/IR/QuantumDialect.cpp
index 04bfe34f2c..385f4e0ae5 100644
--- a/mlir/lib/Quantum/IR/QuantumDialect.cpp
+++ b/mlir/lib/Quantum/IR/QuantumDialect.cpp
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/DialectImplementation.h" // needed for generated type parser
 
 #include "llvm/ADT/TypeSwitch.h" // needed for generated type parser
@@ -44,8 +43,6 @@ void QuantumDialect::initialize()
 #define GET_OP_LIST
 #include "Quantum/IR/QuantumOps.cpp.inc"
         >();
-
-    declarePromisedInterfaces();
 }
 
 //===----------------------------------------------------------------------===//

From 980b0dbfe24a7c7f854deebf3459d4cd01c7b961 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 29 Aug 2024 13:04:09 -0400
Subject: [PATCH 039/183] Fix the other indentations

---
 .../BufferizableOpInterfaceImpl.cpp           | 64 ++++++++++---------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp
index 3302b1ea92..f5101fe3e1 100644
--- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -11,45 +11,47 @@ using namespace catalyst;
 
 namespace {
 
-/// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref.
+/// Bufferization of catalyst.print. Get memref of printOp.val.
 struct PrintOpInterface
     : public bufferization::BufferizableOpInterface::ExternalModel<PrintOpInterface, PrintOp> {
-    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
-                                const bufferization::AnalysisState &state) const {
-        return true;
-    }
-
-    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
-                                 const bufferization::AnalysisState &state) const {
-        return false;
-    }
-
-    bufferization::AliasingValueList getAliasingValues(Operation *op,
-                                                       OpOperand &opOperand,
-                                                       const bufferization::AnalysisState &state) const {
-        return {};
-    }
-
-    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                            const bufferization::BufferizationOptions &options) const {
-        auto printOp = cast<PrintOp>(op);
-        if (printOp.getVal()) {
-            FailureOr<Value> source = getBuffer(rewriter, printOp.getVal(), options);
-            if (failed(source))
-                return failure();
-            bufferization::replaceOpWithNewBufferizedOp<PrintOp>(rewriter, op,
*source, + printOp.getConstValAttr(), printOp.getPrintDescriptorAttr()); + } + return success(); + } + }; } // namespace void catalyst::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { - PrintOp::attachInterface(*ctx); - }); + registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { + //CustomCallOp::attachInterface(*ctx); + PrintOp::attachInterface(*ctx); + }); } \ No newline at end of file From 01df77ea054400987a00859af96315d70b40cec9 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 13:30:15 -0400 Subject: [PATCH 040/183] Add CustomCall bufferization --- .../BufferizableOpInterfaceImpl.cpp | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index f5101fe3e1..4e03cdebb7 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -43,7 +43,72 @@ struct PrintOpInterface } return success(); } +}; + +struct CustomCallOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto customCallOp = cast(op); + + // Add bufferized arguments + SmallVector bufferArgs; + ValueRange operands = customCallOp.getOperands(); + for (Value operand : operands) { + FailureOr opBuffer = getBuffer(rewriter, operand, options); + if (failed(opBuffer)) + return failure(); + bufferArgs.push_back(*opBuffer); + } + // Add bufferized return values to the arguments + ValueRange results = customCallOp.getResults(); + for (Value result : results) { + Type resultType = result.getType(); + RankedTensorType tensorType = dyn_cast(resultType); + if (!tensorType) { + return failure(); + } + auto options = bufferization::BufferizationOptions(); + FailureOr tensorAlloc = bufferization::allocateTensorForShapedValue( + rewriter, op->getLoc(), result, options, false); + MemRefType memrefType = + MemRefType::get(tensorType.getShape(), tensorType.getElementType()); + auto newBuffer = + rewriter.create(op->getLoc(), memrefType, *tensorAlloc); + bufferArgs.push_back(newBuffer); + } + + // Add the initial number of arguments + int32_t numArguments = static_cast(customCallOp.getNumOperands()); + DenseI32ArrayAttr numArgumentsDenseAttr = rewriter.getDenseI32ArrayAttr({numArguments}); + + // Create an updated custom call operation + rewriter.create(op->getLoc(), TypeRange{}, bufferArgs, + customCallOp.getCallTargetName(), numArgumentsDenseAttr); + size_t startIndex = bufferArgs.size() - customCallOp.getNumResults(); + SmallVector bufferResults(bufferArgs.begin() + startIndex, bufferArgs.end()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, bufferResults); + + return success(); + } }; } // namespace @@ -51,7 +116,7 @@ struct PrintOpInterface void 
catalyst::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { - //CustomCallOp::attachInterface(*ctx); + CustomCallOp::attachInterface(*ctx); PrintOp::attachInterface(*ctx); }); } \ No newline at end of file From 4a90eefa5fdce2e099f6602f81c95879fb78f8b4 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 13:34:46 -0400 Subject: [PATCH 041/183] Set write flag of custom call as false --- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 4e03cdebb7..a1cd08200d 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -55,7 +55,7 @@ struct CustomCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return true; + return false; } bufferization::AliasingValueList getAliasingValues(Operation *op, From c4547ae733202c12ca31f0f8571b3c0e228935f0 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 15:25:34 -0400 Subject: [PATCH 042/183] Tentatively add CallbackCallOp bufferization --- .../BufferizableOpInterfaceImpl.cpp | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index a1cd08200d..d6dc6b7800 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -45,6 +45,7 @@ struct PrintOpInterface } }; +/// Bufferization of catalyst.print. Mainly get buffers for arguments. struct CustomCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { @@ -111,6 +112,111 @@ struct CustomCallOpInterface } }; +struct CallbackOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto callbackOp = cast(op); + + // Only match here if we have all memref arguments and return values. + // Only match if we have result types. 
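+        // In that case the function type is rewritten into destination-passing
+        // style: the (memref) result types are appended to the argument types
+        // and the result list is emptied, e.g. (sketch)
+        //     (memref<2xf64>) -> memref<2xf64>
+        // becomes
+        //     (memref<2xf64>, memref<2xf64>) -> ()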
+ if (!llvm::any_of(callbackOp.getArgumentTypes(), [](Type argType) { return !isa(argType); }) && + !llvm::any_of(callbackOp.getResultTypes(),[](Type argType) { return !isa(argType); }) && + !callbackOp.getResultTypes().empty()) { + + auto argTys = callbackOp.getArgumentTypes(); + auto retTys = callbackOp.getResultTypes(); + SmallVector emptyRets; + SmallVector args(argTys.begin(), argTys.end()); + args.insert(args.end(), retTys.begin(), retTys.end()); + auto callbackTy = rewriter.getFunctionType(args, emptyRets); + rewriter.modifyOpInPlace(op, [&] { callbackOp.setFunctionType(callbackTy); }); + } + + return success(); + } +}; + +struct CallbackCallOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto callOp = cast(op); + + if(callOp->getNumResults() != callOp.getResultTypes().size()) + return failure(); + + SmallVector newInputs; + auto operands = callOp.getOperands(); + for (Value operand : operands) { + FailureOr opBuffer = getBuffer(rewriter, operand, options); + if (failed(opBuffer)) + return failure(); + newInputs.push_back(*opBuffer); + } + auto results = callOp.getResults(); + + auto loc = callOp->getLoc(); + SmallVector outmemrefs; + for (auto result : results) { + FailureOr tensorAlloc = + bufferization::allocateTensorForShapedValue(rewriter, loc, result, options, false); + if (failed(tensorAlloc)) + return failure(); + + auto tensor = *tensorAlloc; + RankedTensorType tensorTy = cast(tensor.getType()); + auto shape = tensorTy.getShape(); + auto elementTy = tensorTy.getElementType(); + auto memrefType = MemRefType::get(shape, elementTy); + auto toMemrefOp = rewriter.create(loc, memrefType, tensor); + auto memref = toMemrefOp.getResult(); + outmemrefs.push_back(memref); + newInputs.push_back(memref); + } + + SmallVector emptyRets; + //rewriter.create(loc, emptyRets, callOp.getCallee(), newInputs); + bufferization::replaceOpWithNewBufferizedOp(rewriter, op, emptyRets, callOp.getCallee(), newInputs); + /*bufferization::replaceOpWithBufferizedValues(rewriter, callOp, outmemrefs);*/ + return success(); + } +}; + } // namespace void catalyst::registerBufferizableOpInterfaceExternalModels( @@ -118,5 +224,7 @@ void catalyst::registerBufferizableOpInterfaceExternalModels( registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { CustomCallOp::attachInterface(*ctx); PrintOp::attachInterface(*ctx); + //CallbackOp::attachInterface(*ctx); + CallbackCallOp::attachInterface(*ctx); }); } \ No newline at end of file From 0c2eea843fbde391a2c009553d89b22c73e3e7b6 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 15:47:11 -0400 Subject: [PATCH 043/183] Still create a new type converter in CallbackCallOp pass --- .../Transforms/BufferizableOpInterfaceImpl.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 
d6dc6b7800..70df95336d 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,5 +1,6 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Transforms/DialectConversion.h" @@ -177,7 +178,13 @@ struct CallbackCallOpInterface const bufferization::BufferizationOptions &options) const { auto callOp = cast(op); - if(callOp->getNumResults() != callOp.getResultTypes().size()) + bufferization::BufferizeTypeConverter typeConverter; + + SmallVector convertedResults; + if (failed(typeConverter.convertTypes(callOp.getResultTypes(), convertedResults))) + return failure(); + + if(callOp->getNumResults() != convertedResults.size()) return failure(); SmallVector newInputs; @@ -210,9 +217,8 @@ struct CallbackCallOpInterface } SmallVector emptyRets; - //rewriter.create(loc, emptyRets, callOp.getCallee(), newInputs); - bufferization::replaceOpWithNewBufferizedOp(rewriter, op, emptyRets, callOp.getCallee(), newInputs); - /*bufferization::replaceOpWithBufferizedValues(rewriter, callOp, outmemrefs);*/ + bufferization::replaceOpWithNewBufferizedOp(rewriter, op, emptyRets, + callOp.getCallee(), newInputs); return success(); } }; @@ -224,7 +230,7 @@ void catalyst::registerBufferizableOpInterfaceExternalModels( registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { CustomCallOp::attachInterface(*ctx); PrintOp::attachInterface(*ctx); - //CallbackOp::attachInterface(*ctx); + CallbackOp::attachInterface(*ctx); CallbackCallOp::attachInterface(*ctx); }); } \ No newline at end of file From 71fa5d719252f48cb8756172d4711f5091bd070c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 29 Aug 2024 17:35:59 -0400 Subject: [PATCH 044/183] Use replaceOpWithBufferizedValues at the end of CallbackCallOp --- .../lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 70df95336d..7717b63f99 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -195,8 +195,8 @@ struct CallbackCallOpInterface return failure(); newInputs.push_back(*opBuffer); } - auto results = callOp.getResults(); + auto results = callOp.getResults(); auto loc = callOp->getLoc(); SmallVector outmemrefs; for (auto result : results) { @@ -217,8 +217,8 @@ struct CallbackCallOpInterface } SmallVector emptyRets; - bufferization::replaceOpWithNewBufferizedOp(rewriter, op, emptyRets, - callOp.getCallee(), newInputs); + rewriter.create(loc, emptyRets, callOp.getCallee(), newInputs); + bufferization::replaceOpWithBufferizedValues(rewriter, op, outmemrefs); return success(); } }; From 4295bfc994e5e8c7d1bd452db1065d9383437fee Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 30 Aug 2024 12:52:22 -0400 Subject: [PATCH 045/183] Add new passes to quantum-opt --- mlir/tools/quantum-opt/quantum-opt.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mlir/tools/quantum-opt/quantum-opt.cpp b/mlir/tools/quantum-opt/quantum-opt.cpp index b2c46933d5..96d9f022a9 100644 --- a/mlir/tools/quantum-opt/quantum-opt.cpp +++ b/mlir/tools/quantum-opt/quantum-opt.cpp @@ -26,12 +26,15 @@ #include 
"Catalyst/IR/CatalystDialect.h" #include "Catalyst/Transforms/Passes.h" +#include "Catalyst/Transforms/BufferizableOpInterfaceImpl.h" #include "Gradient/IR/GradientDialect.h" #include "Gradient/Transforms/Passes.h" +#include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" #include "Mitigation/IR/MitigationDialect.h" #include "Mitigation/Transforms/Passes.h" #include "Quantum/IR/QuantumDialect.h" #include "Quantum/Transforms/Passes.h" +#include "Quantum/Transforms/BufferizableOpInterfaceImpl.h" int main(int argc, char **argv) { @@ -50,6 +53,10 @@ int main(int argc, char **argv) registry.insert(); registry.insert(); + catalyst::registerBufferizableOpInterfaceExternalModels(registry); + catalyst::quantum::registerBufferizableOpInterfaceExternalModels(registry); + catalyst::gradient::registerBufferizableOpInterfaceExternalModels(registry); + return mlir::asMainReturnCode( mlir::MlirOptMain(argc, argv, "Quantum optimizer driver\n", registry)); } From c01dd1f019d970e1aea865430c98980dff4284fd Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 30 Aug 2024 18:05:34 -0400 Subject: [PATCH 046/183] Add AdjointOp bufferization --- .../BufferizableOpInterfaceImpl.cpp | 124 +++++++++++++++++- 1 file changed, 121 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index dee62249f4..69b15e890e 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,21 +1,139 @@ +#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Transforms/DialectConversion.h" #include "Gradient/IR/GradientOps.h" #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" +#include "Quantum/IR/QuantumOps.h" using namespace mlir; using namespace catalyst::gradient; namespace { +constexpr int64_t UNKNOWN = ShapedType::kDynamic; + +LLVM::LLVMFuncOp ensureFunctionDeclaration(RewriterBase &rewriter, Operation *op, + StringRef fnSymbol, Type fnType) +{ + Operation *fnDecl = SymbolTable::lookupNearestSymbolFrom(op, rewriter.getStringAttr(fnSymbol)); + + if (!fnDecl) { + RewriterBase::InsertionGuard insertGuard(rewriter); + ModuleOp mod = op->getParentOfType(); + rewriter.setInsertionPointToStart(mod.getBody()); + + fnDecl = rewriter.create(op->getLoc(), fnSymbol, fnType); + } + else { + assert(isa(fnDecl) && "QIR function declaration is not a LLVMFuncOp"); + } + + return cast(fnDecl); +} + +struct AdjointOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return false; + } + + bufferization::AliasingValueList getAliasingValues(Operation *op, + OpOperand &opOperand, + const bufferization::AnalysisState &state) const { + return {}; + } + + LogicalResult bufferize(Operation 
*op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const { + auto adjointOp = cast(op); + MLIRContext *ctx = rewriter.getContext(); + Location loc = op->getLoc(); + LLVMTypeConverter typeConverter(ctx); + Type vectorType = typeConverter.convertType(MemRefType::get({UNKNOWN}, Float64Type::get(ctx))); + + for (Type type : adjointOp.getResultTypes()) { + if (!isa(type)) + return adjointOp.emitOpError("must be bufferized before lowering"); + + // Currently only expval gradients are supported by the runtime, + // leading to tensor return values. + if (dyn_cast(type) != MemRefType::get({UNKNOWN}, Float64Type::get(ctx))) + return adjointOp.emitOpError("adjoint can only return MemRef or tuple thereof"); + } + + // The callee of the adjoint op must return as a single result the quantum register. + func::FuncOp callee = + SymbolTable::lookupNearestSymbolFrom(adjointOp, adjointOp.getCalleeAttr()); + assert(callee && callee.getNumResults() == 1 && "invalid qfunc symbol in adjoint op"); + + StringRef cacheFnName = "__catalyst__rt__toggle_recorder"; + StringRef gradFnName = "__catalyst__qis__Gradient"; + Type cacheFnSignature = + LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(ctx), IntegerType::get(ctx, 1)); + Type gradFnSignature = LLVM::LLVMFunctionType::get( + LLVM::LLVMVoidType::get(ctx), IntegerType::get(ctx, 64), /*isVarArg=*/true); + + LLVM::LLVMFuncOp cacheFnDecl = + ensureFunctionDeclaration(rewriter, adjointOp, cacheFnName, cacheFnSignature); + LLVM::LLVMFuncOp gradFnDecl = + ensureFunctionDeclaration(rewriter, adjointOp, gradFnName, gradFnSignature); + + // Run the forward pass and cache the circuit. + Value c_true = rewriter.create( + loc, rewriter.getIntegerAttr(IntegerType::get(ctx, 1), 1)); + Value c_false = rewriter.create( + loc, rewriter.getIntegerAttr(IntegerType::get(ctx, 1), 0)); + rewriter.create(loc, cacheFnDecl, c_true); + Value qreg = rewriter.create(loc, callee, adjointOp.getArgs()).getResult(0); + if (!isa(qreg.getType())) + return callee.emitOpError("qfunc must return quantum register"); + rewriter.create(loc, cacheFnDecl, c_false); + + // We follow the C ABI convention of passing result memrefs as struct pointers in the + // arguments to the C function, although in this case as a variadic argument list to allow + // for a varying number of results in a single signature. + Value c1 = rewriter.create(loc, rewriter.getI64IntegerAttr(1)); + Value numResults = rewriter.create( + loc, rewriter.getI64IntegerAttr(adjointOp.getDataIn().size())); + SmallVector args = {numResults}; + + // Might need to modify here. 
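+        // For each pre-allocated result buffer below: reserve a stack slot,
+        // store the memref descriptor into it, and pass the slot's pointer
+        // through the variadic argument list of the gradient runtime call.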
+ for (Value memref : adjointOp.getDataIn()) { + Value newArg = rewriter.create( + loc, LLVM::LLVMPointerType::get(rewriter.getContext()), vectorType, c1); + rewriter.create(loc, memref, newArg); + args.push_back(newArg); + } + + rewriter.create(loc, gradFnDecl, args); + rewriter.create(loc, qreg); + mlir::bufferization::replaceOpWithNewBufferizedOp(rewriter, adjointOp); + + return success(); + } +}; + } // namespace void catalyst::gradient::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { - - }); + registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { + AdjointOp::attachInterface(*ctx); + }); } \ No newline at end of file From 45581c75360b735d8249739558fd97f29df1523f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 30 Aug 2024 18:13:05 -0400 Subject: [PATCH 047/183] Use BufferizeTypeConverter in new pass --- mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 69b15e890e..1dfb518e5e 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -4,6 +4,7 @@ #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -64,7 +65,7 @@ struct AdjointOpInterface auto adjointOp = cast(op); MLIRContext *ctx = rewriter.getContext(); Location loc = op->getLoc(); - LLVMTypeConverter typeConverter(ctx); + bufferization::BufferizeTypeConverter typeConverter; Type vectorType = typeConverter.convertType(MemRefType::get({UNKNOWN}, Float64Type::get(ctx))); for (Type type : adjointOp.getResultTypes()) { From 9ba2cd9cb12c36c985519634cc335ef10aa47ee7 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 3 Sep 2024 17:34:50 -0400 Subject: [PATCH 048/183] Make CallbackOp use Unstructured Control Flow Bufferizable OpInterface --- mlir/lib/Catalyst/IR/CatalystDialect.cpp | 3 ++ .../BufferizableOpInterfaceImpl.cpp | 50 +++++++++++++------ 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/mlir/lib/Catalyst/IR/CatalystDialect.cpp b/mlir/lib/Catalyst/IR/CatalystDialect.cpp index 20df887830..a94a9658c4 100644 --- a/mlir/lib/Catalyst/IR/CatalystDialect.cpp +++ b/mlir/lib/Catalyst/IR/CatalystDialect.cpp @@ -14,6 +14,7 @@ #include "Catalyst/IR/CatalystDialect.h" #include "Catalyst/IR/CatalystOps.h" +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" // needed for generated type parser #include "mlir/Interfaces/FunctionImplementation.h" @@ -40,6 +41,8 @@ void CatalystDialect::initialize() #define GET_OP_LIST #include "Catalyst/IR/CatalystOps.cpp.inc" >(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 7717b63f99..f29cc6fe41 100644 --- 
a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,5 +1,6 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Transforms/DialectConversion.h" @@ -114,8 +115,24 @@ struct CustomCallOpInterface }; struct CallbackOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< + CallbackOpInterface, CallbackOp> { + + static bool supportsUnstructuredControlFlow() { return true; } + + bool hasTensorSemantics(Operation *op) const { + auto isaTensor = llvm::IsaPred; + + // A function has tensor semantics if it has tensor arguments/results. + auto callbackOp = cast(op); + bool hasTensorArg = any_of(callbackOp.getArgumentTypes(), isaTensor); + bool hasTensorResult = any_of(callbackOp.getResultTypes(), isaTensor); + if (hasTensorArg || hasTensorResult) + return true; + + return false; + } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { return true; @@ -136,20 +153,21 @@ struct CallbackOpInterface const bufferization::BufferizationOptions &options) const { auto callbackOp = cast(op); - // Only match here if we have all memref arguments and return values. - // Only match if we have result types. - if (!llvm::any_of(callbackOp.getArgumentTypes(), [](Type argType) { return !isa(argType); }) && - !llvm::any_of(callbackOp.getResultTypes(),[](Type argType) { return !isa(argType); }) && - !callbackOp.getResultTypes().empty()) { - - auto argTys = callbackOp.getArgumentTypes(); - auto retTys = callbackOp.getResultTypes(); - SmallVector emptyRets; - SmallVector args(argTys.begin(), argTys.end()); - args.insert(args.end(), retTys.begin(), retTys.end()); - auto callbackTy = rewriter.getFunctionType(args, emptyRets); - rewriter.modifyOpInPlace(op, [&] { callbackOp.setFunctionType(callbackTy); }); - } + auto argTys = callbackOp.getArgumentTypes(); + auto retTys = callbackOp.getResultTypes(); + SmallVector emptyRets; + SmallVector args(argTys.begin(), argTys.end()); + args.insert(args.end(), retTys.begin(), retTys.end()); + SmallVector bufferArgs; + for (Type ty : args) { + auto tensorType = dyn_cast(ty); + if (!tensorType) + bufferArgs.push_back(ty); + else + bufferArgs.push_back(MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + } + auto callbackTy = rewriter.getFunctionType(bufferArgs, emptyRets); + rewriter.modifyOpInPlace(op, [&] { callbackOp.setFunctionType(callbackTy); }); return success(); } From 58a0bed0b4e5789f255fc21d9d9ee849b3618a1a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 3 Sep 2024 17:50:58 -0400 Subject: [PATCH 049/183] Add tentative patch for moduleOp bufferization --- mlir/patches/moduleOp-bufferization.patch | 153 ++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 mlir/patches/moduleOp-bufferization.patch diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch new file mode 100644 index 0000000000..aaedd050bd --- /dev/null +++ b/mlir/patches/moduleOp-bufferization.patch @@ -0,0 +1,153 @@ +diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp 
b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+index 0a4072605c26..5b1536a59e11 100644
+--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
++++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+@@ -75,7 +75,7 @@ using namespace mlir::bufferization;
+ using namespace mlir::bufferization::func_ext;
+ 
+ /// A mapping of FuncOps to their callers.
+-using FuncCallerMap = DenseMap<func::FuncOp, DenseSet<Operation *>>;
++using FuncCallerMap = DenseMap<FunctionOpInterface, DenseSet<Operation *>>;
+ 
+ /// Get or create FuncAnalysisState.
+ static FuncAnalysisState &
+@@ -247,6 +247,15 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) {
+       SymbolTable::lookupNearestSymbolFrom(callOp, sym));
+ }
+ 
++static FunctionOpInterface getCalledFunction(CallOpInterface callOp) {
++  SymbolRefAttr sym =
++      llvm::dyn_cast_if_present<SymbolRefAttr>(callOp.getCallableForCallee());
++  if (!sym)
++    return nullptr;
++  return dyn_cast_or_null<FunctionOpInterface>(
++      SymbolTable::lookupNearestSymbolFrom(callOp, sym));
++}
++
+ /// Gather equivalence info of CallOps.
+ /// Note: This only adds new equivalence info if the called function was already
+ /// analyzed.
+@@ -277,11 +286,15 @@ static void equivalenceAnalysis(func::FuncOp funcOp,
+ }
+ 
+ /// Return "true" if the given function signature has tensor semantics.
+-static bool hasTensorSignature(func::FuncOp funcOp) {
+-  return llvm::any_of(funcOp.getFunctionType().getInputs(),
++static bool hasTensorSignature(FunctionOpInterface funcOp) {
++  return llvm::any_of(funcOp.getArgumentTypes(),
+                       llvm::IsaPred<TensorType>) ||
+-         llvm::any_of(funcOp.getFunctionType().getResults(),
++         llvm::any_of(funcOp.getResultTypes(),
+                       llvm::IsaPred<TensorType>);
++  /*return llvm::any_of(funcOp.getFunctionType().getInputs(),
++                      llvm::IsaPred<TensorType>) ||
++         llvm::any_of(funcOp.getFunctionType().getResults(),
++                      llvm::IsaPred<TensorType>);*/
+ }
+ 
+ /// Store all functions of the `moduleOp` in `orderedFuncOps`, sorted by
+@@ -291,14 +304,16 @@ static bool hasTensorSignature(func::FuncOp funcOp) {
+ /// retrieve the called FuncOp from any func::CallOp.
+ static LogicalResult
+ getFuncOpsOrderedByCalls(ModuleOp moduleOp,
+-                         SmallVectorImpl<func::FuncOp> &orderedFuncOps,
++                         SmallVectorImpl<FunctionOpInterface> &orderedFuncOps,
+                          FuncCallerMap &callerMap) {
+   // For each FuncOp, the set of functions called by it (i.e. the union of
+   // symbols of all nested func::CallOp).
+-  DenseMap<func::FuncOp, DenseSet<func::FuncOp>> calledBy;
++  DenseMap<FunctionOpInterface, DenseSet<FunctionOpInterface>> calledBy;
+   // For each FuncOp, the number of func::CallOp it contains.
+-  DenseMap<func::FuncOp, unsigned> numberCallOpsContainedInFuncOp;
+-  WalkResult res = moduleOp.walk([&](func::FuncOp funcOp) -> WalkResult {
++  DenseMap<FunctionOpInterface, unsigned> numberCallOpsContainedInFuncOp;
++  WalkResult res = moduleOp.walk([&](FunctionOpInterface funcOpIn) -> WalkResult {
++    if(isa<func::FuncOp>(funcOpIn)) {
++    func::FuncOp funcOp = cast<func::FuncOp>(funcOpIn);
+     if (!funcOp.getBody().empty()) {
+       func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp);
+       if (!returnOp)
+@@ -306,11 +321,12 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp,
+            << "cannot bufferize a FuncOp with tensors and "
+               "without a unique ReturnOp";
+     }
++    }
+ 
+     // Collect function calls and populate the caller map.
+-    numberCallOpsContainedInFuncOp[funcOp] = 0;
+-    return funcOp.walk([&](func::CallOp callOp) -> WalkResult {
+-      func::FuncOp calledFunction = getCalledFunction(callOp);
++    numberCallOpsContainedInFuncOp[funcOpIn] = 0;
++    return funcOpIn.walk([&](CallOpInterface callOp) -> WalkResult {
++      FunctionOpInterface calledFunction = getCalledFunction(callOp);
+       assert(calledFunction && "could not retrieved called func::FuncOp");
+       // If the called function does not have any tensors in its signature, then
+       // it is not necessary to bufferize the callee before the caller.
+@@ -318,8 +334,8 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp,
+         return WalkResult::skip();
+ 
+       callerMap[calledFunction].insert(callOp);
+-      if (calledBy[calledFunction].insert(funcOp).second) {
+-        numberCallOpsContainedInFuncOp[funcOp]++;
++      if (calledBy[calledFunction].insert(funcOpIn).second) {
++        numberCallOpsContainedInFuncOp[funcOpIn]++;
+       }
+       return WalkResult::advance();
+     });
+@@ -379,7 +395,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp,
+   FuncAnalysisState &funcState = getOrCreateFuncAnalysisState(state);
+ 
+   // A list of functions in the order in which they are analyzed + bufferized.
+-  SmallVector<func::FuncOp> orderedFuncOps;
++  SmallVector<FunctionOpInterface> orderedFuncOps;
+ 
+   // A mapping of FuncOps to their callers.
+   FuncCallerMap callerMap;
+@@ -388,7 +404,10 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp,
+     return failure();
+ 
+   // Analyze ops.
+-  for (func::FuncOp funcOp : orderedFuncOps) {
++  for (FunctionOpInterface funcOpIn : orderedFuncOps) {
++    if(!isa<func::FuncOp>(funcOpIn))
+       continue;
++    func::FuncOp funcOp = cast<func::FuncOp>(funcOpIn);
+     if (!state.getOptions().isOpAllowed(funcOp))
+       continue;
+@@ -430,20 +449,20 @@ LogicalResult mlir::bufferization::bufferizeModuleOp(
+   IRRewriter rewriter(moduleOp.getContext());
+ 
+   // A list of functions in the order in which they are analyzed + bufferized.
+-  SmallVector<func::FuncOp> orderedFuncOps;
++  SmallVector<FunctionOpInterface> orderedFuncOps;
+ 
+   // A mapping of FuncOps to their callers.
+   FuncCallerMap callerMap;
+ 
+   if (failed(getFuncOpsOrderedByCalls(moduleOp, orderedFuncOps, callerMap)))
+     return failure();
++  SmallVector ops;
+ 
+   // Bufferize functions.
+-  for (func::FuncOp funcOp : orderedFuncOps) {
++  for (FunctionOpInterface funcOp : orderedFuncOps) {
+     // Note: It would be good to apply cleanups here but we cannot as aliasInfo
+     // would be invalidated.
+-
+-    if (llvm::is_contained(options.noAnalysisFuncFilter, funcOp.getSymName())) {
++    if (llvm::is_contained(options.noAnalysisFuncFilter, funcOp.getName())) {
+       // This function was not analyzed and RaW conflicts were not resolved.
+       // Buffer copies must be inserted before every write.
+       OneShotBufferizationOptions updatedOptions = options;
+@@ -456,8 +475,8 @@ LogicalResult mlir::bufferization::bufferizeModuleOp(
+     }
+ 
+     // Change buffer return types to more precise layout maps.
+-    if (options.inferFunctionResultLayout)
+-      foldMemRefCasts(funcOp);
++    if (options.inferFunctionResultLayout && isa<func::FuncOp>(funcOp))
++      foldMemRefCasts(cast<func::FuncOp>(funcOp));
+   }
+ 
+   // Bufferize all other ops.
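The crux of the patch above is that one-shot module bufferization stops keying its traversal on the concrete func::FuncOp and instead walks the FunctionOpInterface, which is what allows custom function-like ops such as Catalyst's CallbackOp to be ordered and bufferized alongside ordinary functions. A minimal standalone sketch of that traversal pattern, assuming an LLVM checkout where FunctionOpInterface is declared in mlir/Interfaces/FunctionInterfaces.h; the helper name collectFunctionLikeOps is illustrative and not part of the patch:

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Collect every function-like op in a module, whether it is a func::FuncOp,
// an LLVM::LLVMFuncOp, or a custom op such as catalyst.callback, by walking
// the interface rather than a concrete op type.
static SmallVector<FunctionOpInterface> collectFunctionLikeOps(ModuleOp moduleOp)
{
    SmallVector<FunctionOpInterface> funcOps;
    moduleOp.walk([&](FunctionOpInterface funcOp) {
        // Checks that only make sense for func.func (e.g. the unique-ReturnOp
        // requirement) can still be gated on isa<func::FuncOp>(funcOp),
        // exactly as the patch does.
        funcOps.push_back(funcOp);
    });
    return funcOps;
}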
From f0a10bf13489c2a1072cd34bd8191b2150791d49 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 3 Sep 2024 18:03:13 -0400 Subject: [PATCH 050/183] Update patch script --- mlir/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mlir/Makefile b/mlir/Makefile index 1d5a126ef6..c553d3274b 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -12,6 +12,8 @@ ENZYME_BUILD_DIR?=$(MK_DIR)/Enzyme/build RT_BUILD_DIR?=$(MK_DIR)/../runtime/build ENABLE_ASAN?=OFF BUILD_TYPE?=Release +LLVM_TARGET_FILE=$(MK_DIR)/llvm-project/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +LLVM_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -54,6 +56,9 @@ all: llvm mhlo enzyme dialects .PHONY: llvm llvm: @echo "build LLVM and MLIR enabling Python bindings" + @if patch --dry-run -p1 -N $(LLVM_TARGET_FILE) $(LLVM_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 $(LLVM_TARGET_FILE) $(LLVM_PATCH_FILE); \ + fi cmake -G Ninja -S llvm-project/llvm -B $(LLVM_BUILD_DIR) \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ -DLLVM_BUILD_EXAMPLES=OFF \ From 03bee24efc3598de24d20596007b8fc4bfe32ced Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 4 Sep 2024 09:59:41 -0400 Subject: [PATCH 051/183] Reformatting --- .../Transforms/BufferizableOpInterfaceImpl.h | 2 - mlir/lib/Catalyst/IR/CatalystDialect.cpp | 4 +- .../BufferizableOpInterfaceImpl.cpp | 119 +++++----- .../BufferizableOpInterfaceImpl.cpp | 37 +-- .../BufferizableOpInterfaceImpl.cpp | 216 ++++++++++-------- mlir/tools/quantum-opt/quantum-opt.cpp | 6 +- 6 files changed, 220 insertions(+), 164 deletions(-) diff --git a/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h index 9c0e982cb0..e56c914ba4 100644 --- a/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h +++ b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h @@ -4,8 +4,6 @@ using namespace mlir; namespace catalyst { - void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry ®istry); - } // namespace catalyst \ No newline at end of file diff --git a/mlir/lib/Catalyst/IR/CatalystDialect.cpp b/mlir/lib/Catalyst/IR/CatalystDialect.cpp index a94a9658c4..158330521d 100644 --- a/mlir/lib/Catalyst/IR/CatalystDialect.cpp +++ b/mlir/lib/Catalyst/IR/CatalystDialect.cpp @@ -41,8 +41,8 @@ void CatalystDialect::initialize() #define GET_OP_LIST #include "Catalyst/IR/CatalystOps.cpp.inc" >(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index f29cc6fe41..fff38f63c2 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -15,33 +15,36 @@ namespace { /// Bufferization of catalyst.print. Get memref of printOp.val. 
struct PrintOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto printOp = cast(op); if (printOp.getVal()) { FailureOr source = getBuffer(rewriter, printOp.getVal(), options); if (failed(source)) return failure(); - bufferization::replaceOpWithNewBufferizedOp(rewriter, op, *source, - printOp.getConstValAttr(), printOp.getPrintDescriptorAttr()); + bufferization::replaceOpWithNewBufferizedOp( + rewriter, op, *source, printOp.getConstValAttr(), printOp.getPrintDescriptorAttr()); } return success(); } @@ -50,25 +53,29 @@ struct PrintOpInterface /// Bufferization of catalyst.print. Mainly get buffers for arguments. struct CustomCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + CustomCallOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto customCallOp = cast(op); // Add bufferized arguments @@ -99,7 +106,7 @@ struct CustomCallOpInterface bufferArgs.push_back(newBuffer); } - // Add the initial number of arguments + // Add the initial number of arguments int32_t numArguments = static_cast(customCallOp.getNumOperands()); DenseI32ArrayAttr numArgumentsDenseAttr = rewriter.getDenseI32ArrayAttr({numArguments}); @@ -116,11 +123,12 @@ struct CustomCallOpInterface struct CallbackOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - CallbackOpInterface, CallbackOp> { + CallbackOpInterface, CallbackOp> { static bool supportsUnstructuredControlFlow() { return true; } - bool hasTensorSemantics(Operation *op) const { + bool hasTensorSemantics(Operation *op) const + { auto isaTensor = llvm::IsaPred; // A function has tensor semantics if it has tensor arguments/results. 
@@ -134,40 +142,45 @@ struct CallbackOpInterface } bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto callbackOp = cast(op); - auto argTys = callbackOp.getArgumentTypes(); - auto retTys = callbackOp.getResultTypes(); - SmallVector emptyRets; - SmallVector args(argTys.begin(), argTys.end()); - args.insert(args.end(), retTys.begin(), retTys.end()); - SmallVector bufferArgs; - for (Type ty : args) { - auto tensorType = dyn_cast(ty); - if (!tensorType) - bufferArgs.push_back(ty); - else - bufferArgs.push_back(MemRefType::get(tensorType.getShape(), tensorType.getElementType())); - } - auto callbackTy = rewriter.getFunctionType(bufferArgs, emptyRets); - rewriter.modifyOpInPlace(op, [&] { callbackOp.setFunctionType(callbackTy); }); + auto argTys = callbackOp.getArgumentTypes(); + auto retTys = callbackOp.getResultTypes(); + SmallVector emptyRets; + SmallVector args(argTys.begin(), argTys.end()); + args.insert(args.end(), retTys.begin(), retTys.end()); + SmallVector bufferArgs; + for (Type ty : args) { + auto tensorType = dyn_cast(ty); + if (!tensorType) + bufferArgs.push_back(ty); + else + bufferArgs.push_back( + MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + } + auto callbackTy = rewriter.getFunctionType(bufferArgs, emptyRets); + rewriter.modifyOpInPlace(op, [&] { callbackOp.setFunctionType(callbackTy); }); return success(); } @@ -175,25 +188,29 @@ struct CallbackOpInterface struct CallbackCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + CallbackCallOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto callOp = cast(op); bufferization::BufferizeTypeConverter typeConverter; @@ -202,7 +219,7 @@ struct CallbackCallOpInterface if (failed(typeConverter.convertTypes(callOp.getResultTypes(), convertedResults))) return failure(); - if(callOp->getNumResults() != convertedResults.size()) + if (callOp->getNumResults() != 
convertedResults.size()) return failure(); SmallVector newInputs; @@ -243,8 +260,8 @@ struct CallbackCallOpInterface } // namespace -void catalyst::registerBufferizableOpInterfaceExternalModels( - DialectRegistry ®istry) { +void catalyst::registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) +{ registry.addExtension(+[](MLIRContext *ctx, CatalystDialect *dialect) { CustomCallOp::attachInterface(*ctx); PrintOp::attachInterface(*ctx); diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 1dfb518e5e..d1155da8e7 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,7 +1,6 @@ #include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" @@ -42,31 +41,35 @@ LLVM::LLVMFuncOp ensureFunctionDeclaration(RewriterBase &rewriter, Operation *op } struct AdjointOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto adjointOp = cast(op); MLIRContext *ctx = rewriter.getContext(); Location loc = op->getLoc(); bufferization::BufferizeTypeConverter typeConverter; - Type vectorType = typeConverter.convertType(MemRefType::get({UNKNOWN}, Float64Type::get(ctx))); + Type vectorType = + typeConverter.convertType(MemRefType::get({UNKNOWN}, Float64Type::get(ctx))); for (Type type : adjointOp.getResultTypes()) { if (!isa(type)) @@ -75,12 +78,13 @@ struct AdjointOpInterface // Currently only expval gradients are supported by the runtime, // leading to tensor return values. if (dyn_cast(type) != MemRefType::get({UNKNOWN}, Float64Type::get(ctx))) - return adjointOp.emitOpError("adjoint can only return MemRef or tuple thereof"); + return adjointOp.emitOpError( + "adjoint can only return MemRef or tuple thereof"); } // The callee of the adjoint op must return as a single result the quantum register. 
- func::FuncOp callee = - SymbolTable::lookupNearestSymbolFrom(adjointOp, adjointOp.getCalleeAttr()); + func::FuncOp callee = SymbolTable::lookupNearestSymbolFrom( + adjointOp, adjointOp.getCalleeAttr()); assert(callee && callee.getNumResults() == 1 && "invalid qfunc symbol in adjoint op"); StringRef cacheFnName = "__catalyst__rt__toggle_recorder"; @@ -124,7 +128,8 @@ struct AdjointOpInterface rewriter.create(loc, gradFnDecl, args); rewriter.create(loc, qreg); - mlir::bufferization::replaceOpWithNewBufferizedOp(rewriter, adjointOp); + mlir::bufferization::replaceOpWithNewBufferizedOp( + rewriter, adjointOp); return success(); } @@ -132,8 +137,8 @@ struct AdjointOpInterface } // namespace -void catalyst::gradient::registerBufferizableOpInterfaceExternalModels( - DialectRegistry ®istry) { +void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) +{ registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { AdjointOp::attachInterface(*ctx); }); diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 8e09a014eb..08689c18a9 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -14,38 +14,43 @@ namespace { /// Bufferization of catalyst.quantum.unitary. Convert Matrix into memref. struct QubitUnitaryOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + QubitUnitaryOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto qubitUnitaryOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(qubitUnitaryOp.getMatrix().getType()); MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - qubitUnitaryOp.getMatrix()); + auto toMemrefOp = + rewriter.create(loc, memrefType, qubitUnitaryOp.getMatrix()); auto memref = toMemrefOp.getResult(); auto newQubitUnitaryOp = rewriter.create( - loc, qubitUnitaryOp.getOutQubits().getTypes(), - qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, - qubitUnitaryOp.getInQubits(), qubitUnitaryOp.getAdjointAttr(), - qubitUnitaryOp.getInCtrlQubits(), qubitUnitaryOp.getInCtrlValues()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, newQubitUnitaryOp.getOutQubits()); + loc, qubitUnitaryOp.getOutQubits().getTypes(), + qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, qubitUnitaryOp.getInQubits(), + qubitUnitaryOp.getAdjointAttr(), qubitUnitaryOp.getInCtrlQubits(), + qubitUnitaryOp.getInCtrlValues()); + bufferization::replaceOpWithBufferizedValues(rewriter, op, + newQubitUnitaryOp.getOutQubits()); return 
success(); } @@ -54,31 +59,35 @@ struct QubitUnitaryOpInterface /// Bufferization of catalyst.quantum.hermitian. Convert Matrix into memref. struct HermitianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + HermitianOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto hermitianOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(hermitianOp.getMatrix().getType()); MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - hermitianOp.getMatrix()); + auto toMemrefOp = + rewriter.create(loc, memrefType, hermitianOp.getMatrix()); auto memref = toMemrefOp.getResult(); auto newHermitianOp = rewriter.create(loc, hermitianOp.getType(), memref, hermitianOp.getQubits()); @@ -91,34 +100,38 @@ struct HermitianOpInterface /// Bufferization of catalyst.quantum.hamiltonian. Convert Matrix into memref. struct HamiltonianOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + HamiltonianOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto hamiltonianOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(hamiltonianOp.getCoeffs().getType()); MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - hamiltonianOp.getCoeffs()); + auto toMemrefOp = + rewriter.create(loc, memrefType, hamiltonianOp.getCoeffs()); auto memref = toMemrefOp.getResult(); auto newHamiltonianOp = rewriter.create(loc, hamiltonianOp.getType(), memref, - hamiltonianOp.getTerms()); + hamiltonianOp.getTerms()); bufferization::replaceOpWithBufferizedValues(rewriter, op, newHamiltonianOp.getObs()); return success(); @@ -128,26 +141,29 @@ struct HamiltonianOpInterface /// Bufferization of catalyst.quantum.sample. 
Replace with memref.alloc and a new /// catalyst.quantum.sample that uses the memory allocated by memref.alloc. struct SampleOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto sampleOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(sampleOp.getSamples().getType()); @@ -165,26 +181,29 @@ struct SampleOpInterface /// Bufferization of catalyst.quantum.state. Replace with memref.alloc and a new /// catalyst.quantum.state that uses the memory allocated by memref.alloc. struct StateOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto stateOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(stateOp.getState().getType()); @@ -201,26 +220,29 @@ struct StateOpInterface /// Bufferization of catalyst.quantum.probs. Replace with memref.alloc and a new /// catalyst.quantum.probs that uses the memory allocated by memref.alloc. 
struct ProbsOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto probsOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(probsOp.getProbabilities().getType()); @@ -237,38 +259,44 @@ struct ProbsOpInterface /// Bufferization of catalyst.quantum.counts. Replace with memref.allocs and a new /// catalyst.quantum.counts that uses the memory allocated by memref.allocs. struct CountsOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto countsOp = cast(op); Location loc = op->getLoc(); auto tensorType0 = cast(countsOp.getEigvals().getType()); auto tensorType1 = cast(countsOp.getCounts().getType()); - MemRefType resultType0 = MemRefType::get(tensorType0.getShape(), tensorType0.getElementType()); - MemRefType resultType1 = MemRefType::get(tensorType1.getShape(), tensorType1.getElementType()); + MemRefType resultType0 = + MemRefType::get(tensorType0.getShape(), tensorType0.getElementType()); + MemRefType resultType1 = + MemRefType::get(tensorType1.getShape(), tensorType1.getElementType()); Value allocVal0 = rewriter.create(loc, resultType0); Value allocVal1 = rewriter.create(loc, resultType1); rewriter.create(loc, nullptr, nullptr, countsOp.getObs(), allocVal0, allocVal1, countsOp.getShotsAttr()); - bufferization::replaceOpWithBufferizedValues(rewriter, op, ValueRange{allocVal0, allocVal1}); + bufferization::replaceOpWithBufferizedValues(rewriter, op, + ValueRange{allocVal0, allocVal1}); return success(); } @@ -277,32 +305,36 @@ struct CountsOpInterface /// Bufferization of catalyst.quantum.set_state. Convert InState into memref. 
struct SetStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + SetStateOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto setStateOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(setStateOp.getInState().getType()); MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - setStateOp.getInState()); + auto toMemrefOp = + rewriter.create(loc, memrefType, setStateOp.getInState()); auto memref = toMemrefOp.getResult(); auto newSetStateOp = rewriter.create(loc, setStateOp.getOutQubits().getTypes(), memref, setStateOp.getInQubits()); @@ -314,35 +346,39 @@ struct SetStateOpInterface /// Bufferization of catalyst.quantum.set_basic_state. Convert BasisState into memref. struct SetBasisStateOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { + SetBasisStateOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return false; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return true; } - bufferization::AliasingValueList getAliasingValues(Operation *op, - OpOperand &opOperand, - const bufferization::AnalysisState &state) const { + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { return {}; } LogicalResult bufferize(Operation *op, RewriterBase &rewriter, - const bufferization::BufferizationOptions &options) const { + const bufferization::BufferizationOptions &options) const + { auto setBasisStateOp = cast(op); Location loc = op->getLoc(); auto tensorType = cast(setBasisStateOp.getBasisState().getType()); MemRefType memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - auto toMemrefOp = rewriter.create(loc, memrefType, - setBasisStateOp.getBasisState()); + auto toMemrefOp = rewriter.create( + loc, memrefType, setBasisStateOp.getBasisState()); auto memref = toMemrefOp.getResult(); - auto newSetStateOp = rewriter.create(loc, setBasisStateOp.getOutQubits().getTypes(), - memref, setBasisStateOp.getInQubits()); + auto newSetStateOp = rewriter.create( + loc, setBasisStateOp.getOutQubits().getTypes(), memref, setBasisStateOp.getInQubits()); bufferization::replaceOpWithBufferizedValues(rewriter, op, newSetStateOp.getOutQubits()); return success(); } @@ -350,8 +386,8 @@ struct SetBasisStateOpInterface } // namespace -void 
catalyst::quantum::registerBufferizableOpInterfaceExternalModels( - DialectRegistry ®istry) { +void catalyst::quantum::registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) +{ registry.addExtension(+[](MLIRContext *ctx, QuantumDialect *dialect) { QubitUnitaryOp::attachInterface(*ctx); HermitianOp::attachInterface(*ctx); diff --git a/mlir/tools/quantum-opt/quantum-opt.cpp b/mlir/tools/quantum-opt/quantum-opt.cpp index 96d9f022a9..aa11589368 100644 --- a/mlir/tools/quantum-opt/quantum-opt.cpp +++ b/mlir/tools/quantum-opt/quantum-opt.cpp @@ -25,16 +25,16 @@ #include "mhlo/IR/hlo_ops.h" #include "Catalyst/IR/CatalystDialect.h" -#include "Catalyst/Transforms/Passes.h" #include "Catalyst/Transforms/BufferizableOpInterfaceImpl.h" +#include "Catalyst/Transforms/Passes.h" #include "Gradient/IR/GradientDialect.h" -#include "Gradient/Transforms/Passes.h" #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" +#include "Gradient/Transforms/Passes.h" #include "Mitigation/IR/MitigationDialect.h" #include "Mitigation/Transforms/Passes.h" #include "Quantum/IR/QuantumDialect.h" -#include "Quantum/Transforms/Passes.h" #include "Quantum/Transforms/BufferizableOpInterfaceImpl.h" +#include "Quantum/Transforms/Passes.h" int main(int argc, char **argv) { From ba85b20f636d9e84419e28d54b1ad889a863776d Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 4 Sep 2024 13:10:45 -0400 Subject: [PATCH 052/183] Update llvm patch --- mlir/patches/moduleOp-bufferization.patch | 96 ++++++++++++++--------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch index aaedd050bd..c9d15d3c5f 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/moduleOp-bufferization.patch @@ -1,5 +1,5 @@ diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp -index 0a4072605c26..5b1536a59e11 100644 +index 0a4072605c26..2983af0fcbf3 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -75,7 +75,7 @@ using namespace mlir::bufferization; @@ -27,7 +27,7 @@ index 0a4072605c26..5b1536a59e11 100644 /// Gather equivalence info of CallOps. /// Note: This only adds new equivalence info if the called function was already /// analyzed. -@@ -277,11 +286,15 @@ static void equivalenceAnalysis(func::FuncOp funcOp, +@@ -277,10 +286,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, } /// Return "true" if the given function signature has tensor semantics. @@ -39,14 +39,9 @@ index 0a4072605c26..5b1536a59e11 100644 - llvm::any_of(funcOp.getFunctionType().getResults(), + llvm::any_of(funcOp.getResultTypes(), llvm::IsaPred); -+ /*return llvm::any_of(funcOp.getFunctionType().getInputs(), -+ llvm::IsaPred) || -+ llvm::any_of(funcOp.getFunctionType().getResults(), -+ llvm::IsaPred);*/ } - /// Store all functions of the `moduleOp` in `orderedFuncOps`, sorted by -@@ -291,14 +304,16 @@ static bool hasTensorSignature(func::FuncOp funcOp) { +@@ -291,26 +300,30 @@ static bool hasTensorSignature(func::FuncOp funcOp) { /// retrieve the called FuncOp from any func::CallOp. static LogicalResult getFuncOpsOrderedByCalls(ModuleOp moduleOp, @@ -60,41 +55,36 @@ index 0a4072605c26..5b1536a59e11 100644 // For each FuncOp, the number of func::CallOp it contains. 
- DenseMap numberCallOpsContainedInFuncOp; - WalkResult res = moduleOp.walk([&](func::FuncOp funcOp) -> WalkResult { +- if (!funcOp.getBody().empty()) { +- func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); +- if (!returnOp) +- return funcOp->emitError() +- << "cannot bufferize a FuncOp with tensors and " +- "without a unique ReturnOp"; + DenseMap numberCallOpsContainedInFuncOp; -+ WalkResult res = moduleOp.walk([&](FunctionOpInterface funcOpIn) -> WalkResult { -+ if(isa(funcOpIn)) { -+ func::FuncOp funcOp = cast(funcOpIn); - if (!funcOp.getBody().empty()) { - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); - if (!returnOp) -@@ -306,11 +321,12 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, - << "cannot bufferize a FuncOp with tensors and " - "without a unique ReturnOp"; ++ WalkResult res = moduleOp.walk([&](FunctionOpInterface funcOp) -> WalkResult { ++ // Only handle ReturnOp if funcOp is exactly the FuncOp type. ++ if(isa(funcOp)) { ++ FuncOp funcOpCasted = cast(funcOp); ++ if (!funcOpCasted.getBody().empty()) { ++ func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOpCasted); ++ if (!returnOp) ++ return funcOp->emitError() ++ << "cannot bufferize a FuncOp with tensors and " ++ "without a unique ReturnOp"; ++ } } -+ } // Collect function calls and populate the caller map. -- numberCallOpsContainedInFuncOp[funcOp] = 0; + numberCallOpsContainedInFuncOp[funcOp] = 0; - return funcOp.walk([&](func::CallOp callOp) -> WalkResult { - func::FuncOp calledFunction = getCalledFunction(callOp); -+ numberCallOpsContainedInFuncOp[funcOpIn] = 0; -+ return funcOpIn.walk([&](CallOpInterface callOp) -> WalkResult { ++ return funcOp.walk([&](CallOpInterface callOp) -> WalkResult { + FunctionOpInterface calledFunction = getCalledFunction(callOp); assert(calledFunction && "could not retrieved called func::FuncOp"); // If the called function does not have any tensors in its signature, then // it is not necessary to bufferize the callee before the caller. -@@ -318,8 +334,8 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, - return WalkResult::skip(); - - callerMap[calledFunction].insert(callOp); -- if (calledBy[calledFunction].insert(funcOp).second) { -- numberCallOpsContainedInFuncOp[funcOp]++; -+ if (calledBy[calledFunction].insert(funcOpIn).second) { -+ numberCallOpsContainedInFuncOp[funcOpIn]++; - } - return WalkResult::advance(); - }); -@@ -379,7 +395,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -379,7 +392,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, FuncAnalysisState &funcState = getOrCreateFuncAnalysisState(state); // A list of functions in the order in which they are analyzed + bufferized. @@ -103,18 +93,48 @@ index 0a4072605c26..5b1536a59e11 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -388,7 +404,10 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -388,27 +401,33 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, return failure(); // Analyze ops. - for (func::FuncOp funcOp : orderedFuncOps) { -+ for (FunctionOpInterface funcOpIn : orderedFuncOps) { -+ if(!isa(funcOpIn)) +- if (!state.getOptions().isOpAllowed(funcOp)) ++ for (FunctionOpInterface funcOp : orderedFuncOps) { ++ ++ // The following analysis is specific to the FuncOp type. ++ if(!isa(funcOp)) + continue; -+ func::FuncOp funcOp = cast(funcOpIn); - if (!state.getOptions().isOpAllowed(funcOp)) ++ FuncOp funcOpCasted = cast(funcOp); ++ ++ if (!state.getOptions().isOpAllowed(funcOpCasted)) continue; + // Now analyzing function. 
+- funcState.startFunctionAnalysis(funcOp); ++ funcState.startFunctionAnalysis(funcOpCasted); + + // Gather equivalence info for CallOps. +- equivalenceAnalysis(funcOp, state, funcState); ++ equivalenceAnalysis(funcOpCasted, state, funcState); + + // Analyze funcOp. +- if (failed(analyzeOp(funcOp, state, statistics))) ++ if (failed(analyzeOp(funcOpCasted, state, statistics))) + return failure(); + + // Run some extra function analyses. +- if (failed(aliasingFuncOpBBArgsAnalysis(funcOp, state, funcState)) || +- failed(funcOpBbArgReadWriteAnalysis(funcOp, state, funcState))) ++ if (failed(aliasingFuncOpBBArgsAnalysis(funcOpCasted, state, funcState)) || ++ failed(funcOpBbArgReadWriteAnalysis(funcOpCasted, state, funcState))) + return failure(); + + // Mark op as fully analyzed. +- funcState.analyzedFuncOps[funcOp] = FuncOpAnalysisState::Analyzed; ++ funcState.analyzedFuncOps[funcOpCasted] = FuncOpAnalysisState::Analyzed; + } + + return success(); @@ -430,20 +449,20 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( IRRewriter rewriter(moduleOp.getContext()); From 1c730d2aa52ce0abf6ffb7c9da22b6f8fa15205d Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 4 Sep 2024 13:15:19 -0400 Subject: [PATCH 053/183] Stick with old llvm version for now --- .dep-versions | 4 ++-- mlir/llvm-project | 2 +- mlir/mlir-hlo | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.dep-versions b/.dep-versions index d66a0df75e..25e038fc10 100644 --- a/.dep-versions +++ b/.dep-versions @@ -1,7 +1,7 @@ # Always update the version check in catalyst.__init__ when changing the JAX version. jax=0.4.28 -mhlo=39c37c43fb9db18144f2e155a0fe65864646a968 -llvm=6f2c61071c274a1b5e212e6ad4114641ec7c7fc3 +mhlo=89a891c986650c33df76885f5620e0a92150d90f +llvm=3a8316216807d64a586b971f51695e23883331f7 enzyme=v0.0.130 # Always remove custom PL/LQ versions before release. 
diff --git a/mlir/llvm-project b/mlir/llvm-project
index 6f2c61071c..3a83162168 160000
--- a/mlir/llvm-project
+++ b/mlir/llvm-project
@@ -1 +1 @@
-Subproject commit 6f2c61071c274a1b5e212e6ad4114641ec7c7fc3
+Subproject commit 3a8316216807d64a586b971f51695e23883331f7
diff --git a/mlir/mlir-hlo b/mlir/mlir-hlo
index 39c37c43fb..89a891c986 160000
--- a/mlir/mlir-hlo
+++ b/mlir/mlir-hlo
@@ -1 +1 @@
-Subproject commit 39c37c43fb9db18144f2e155a0fe65864646a968
+Subproject commit 89a891c986650c33df76885f5620e0a92150d90f

From 31c9139bb43134d5a5b757dd8237da818a91d466 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 4 Sep 2024 13:28:29 -0400
Subject: [PATCH 054/183] Roll back LLVM fix

---
 mlir/lib/Driver/CompilerDriver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp
index 67533ad2ef..004fa92c7f 100644
--- a/mlir/lib/Driver/CompilerDriver.cpp
+++ b/mlir/lib/Driver/CompilerDriver.cpp
@@ -673,7 +673,7 @@ LogicalResult QuantumDriverMain(const CompilerOptions &options, CompilerOutput &
 if (options.lowerToLLVM) {
 llvmModule = timer::timer(translateModuleToLLVMIR, "translateModuleToLLVMIR",
- /* add_endl */ false, *op, llvmContext, "LLVMDialectModule", false);
+ /* add_endl */ false, *op, llvmContext, "LLVMDialectModule");
 if (!llvmModule) {
 CO_MSG(options, Verbosity::Urgent, "Failed to translate LLVM module\n");
 return failure();

From 525422b71b3a48b0ca26de4dcd363b902f9956b7 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 4 Sep 2024 13:32:31 -0400
Subject: [PATCH 055/183] Reformatting

---
 frontend/catalyst/compiler.py | 20 ++++++++++----------
 mlir/lib/Driver/CompilerDriver.cpp | 5 ++---
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 85d3172db6..91df7f9716 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -195,12 +195,12 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
 [
 "eliminate-empty-tensors",
 "one-shot-bufferize{bufferize-function-boundaries}",
- #"func.func(buffer-hoisting)",
- #"func.func(buffer-loop-hoisting)",
- #"buffer-results-to-out-params",
- #"drop-equivalent-buffer-results",
- #"func.func(promote-buffers-to-stack)",
- #"buffer-deallocation-pipeline",
+ # "func.func(buffer-hoisting)",
+ # "func.func(buffer-loop-hoisting)",
+ # "buffer-results-to-out-params",
+ # "drop-equivalent-buffer-results",
+ # "func.func(promote-buffers-to-stack)",
+ # "buffer-deallocation-pipeline",
 ],
 )

@@ -256,8 +256,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
 HLO_LOWERING_PASS,
 QUANTUM_COMPILATION_PASS,
 BUFFERIZATION_PASS,
- #BUFFERIZATION_PASS2,
- #BUFFERIZATION_PASS3,
+ # BUFFERIZATION_PASS2,
+ # BUFFERIZATION_PASS3,
 MLIR_TO_LLVM_PASS,
 ]

@@ -273,8 +273,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
 HLO_LOWERING_PASS,
 QUANTUM_COMPILATION_PASS,
 BUFFERIZATION_PASS,
- #BUFFERIZATION_PASS2,
- #BUFFERIZATION_PASS3,
+ # BUFFERIZATION_PASS2,
+ # BUFFERIZATION_PASS3,
 MLIR_TO_LLVM_ASYNC_PASS,
 ]

diff --git a/mlir/lib/Driver/CompilerDriver.cpp b/mlir/lib/Driver/CompilerDriver.cpp
index 004fa92c7f..74340385a6 100644
--- a/mlir/lib/Driver/CompilerDriver.cpp
+++ b/mlir/lib/Driver/CompilerDriver.cpp
@@ -671,9 +671,8 @@ LogicalResult QuantumDriverMain(const CompilerOptions &options, CompilerOutput &
 outIRStream << *op;

 if (options.lowerToLLVM) {
- llvmModule =
-
timer::timer(translateModuleToLLVMIR, "translateModuleToLLVMIR", - /* add_endl */ false, *op, llvmContext, "LLVMDialectModule"); + llvmModule = timer::timer(translateModuleToLLVMIR, "translateModuleToLLVMIR", + /* add_endl */ false, *op, llvmContext, "LLVMDialectModule"); if (!llvmModule) { CO_MSG(options, Verbosity::Urgent, "Failed to translate LLVM module\n"); return failure(); From d7c236d7e31c947319ccc5dd80cab2d9202f6182 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 4 Sep 2024 15:49:18 -0400 Subject: [PATCH 056/183] Update patch --- mlir/patches/moduleOp-bufferization.patch | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch index c9d15d3c5f..75a8a65639 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/moduleOp-bufferization.patch @@ -1,5 +1,5 @@ diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp -index 0a4072605c26..2983af0fcbf3 100644 +index 0a4072605c26..5231fe860553 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -75,7 +75,7 @@ using namespace mlir::bufferization; @@ -135,7 +135,7 @@ index 0a4072605c26..2983af0fcbf3 100644 } return success(); -@@ -430,20 +449,20 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -430,7 +449,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( IRRewriter rewriter(moduleOp.getContext()); // A list of functions in the order in which they are analyzed + bufferized. @@ -144,17 +144,15 @@ index 0a4072605c26..2983af0fcbf3 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; - - if (failed(getFuncOpsOrderedByCalls(moduleOp, orderedFuncOps, callerMap))) +@@ -439,11 +458,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( return failure(); -+ SmallVector ops; // Bufferize functions. - for (func::FuncOp funcOp : orderedFuncOps) { + for (FunctionOpInterface funcOp : orderedFuncOps) { // Note: It would be good to apply cleanups here but we cannot as aliasInfo // would be invalidated. -- + - if (llvm::is_contained(options.noAnalysisFuncFilter, funcOp.getSymName())) { + if (llvm::is_contained(options.noAnalysisFuncFilter, funcOp.getName())) { // This function was not analyzed and RaW conflicts were not resolved. From ac4fc89b623bdc17482b606eeca07ee0479c4282 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 4 Sep 2024 16:48:59 -0400 Subject: [PATCH 057/183] Correct AdjointOpInterface --- mlir/lib/Gradient/IR/GradientDialect.cpp | 2 + .../BufferizableOpInterfaceImpl.cpp | 80 ++++--------------- 2 files changed, 17 insertions(+), 65 deletions(-) diff --git a/mlir/lib/Gradient/IR/GradientDialect.cpp b/mlir/lib/Gradient/IR/GradientDialect.cpp index 4d9cfddb00..74f7d05aa7 100644 --- a/mlir/lib/Gradient/IR/GradientDialect.cpp +++ b/mlir/lib/Gradient/IR/GradientDialect.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Transforms/InliningUtils.h" #include "Gradient/IR/GradientDialect.h" @@ -50,6 +51,7 @@ void GradientDialect::initialize() #include "Gradient/IR/GradientOps.cpp.inc" >(); addInterface(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index d1155da8e7..15c2090abd 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -64,73 +64,23 @@ struct AdjointOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { - auto adjointOp = cast(op); - MLIRContext *ctx = rewriter.getContext(); - Location loc = op->getLoc(); - bufferization::BufferizeTypeConverter typeConverter; - Type vectorType = - typeConverter.convertType(MemRefType::get({UNKNOWN}, Float64Type::get(ctx))); - - for (Type type : adjointOp.getResultTypes()) { - if (!isa(type)) - return adjointOp.emitOpError("must be bufferized before lowering"); - - // Currently only expval gradients are supported by the runtime, - // leading to tensor return values. - if (dyn_cast(type) != MemRefType::get({UNKNOWN}, Float64Type::get(ctx))) - return adjointOp.emitOpError( - "adjoint can only return MemRef or tuple thereof"); + llvm::outs() << "Happens?\n"; + SmallVector resTypes; + if (failed(getTypeConverter()->convertTypes(op.getResultTypes(), resTypes))) + return failure(); + + Location loc = op.getLoc(); + Value gradSize = op.getGradSize(); + SmallVector memrefValues; + for (Type resType : resTypes) { + MemRefType memrefType = cast(resType); + Value memrefValue = rewriter.create(loc, memrefType, gradSize); + memrefValues.push_back(memrefValue); } - // The callee of the adjoint op must return as a single result the quantum register. - func::FuncOp callee = SymbolTable::lookupNearestSymbolFrom( - adjointOp, adjointOp.getCalleeAttr()); - assert(callee && callee.getNumResults() == 1 && "invalid qfunc symbol in adjoint op"); - - StringRef cacheFnName = "__catalyst__rt__toggle_recorder"; - StringRef gradFnName = "__catalyst__qis__Gradient"; - Type cacheFnSignature = - LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(ctx), IntegerType::get(ctx, 1)); - Type gradFnSignature = LLVM::LLVMFunctionType::get( - LLVM::LLVMVoidType::get(ctx), IntegerType::get(ctx, 64), /*isVarArg=*/true); - - LLVM::LLVMFuncOp cacheFnDecl = - ensureFunctionDeclaration(rewriter, adjointOp, cacheFnName, cacheFnSignature); - LLVM::LLVMFuncOp gradFnDecl = - ensureFunctionDeclaration(rewriter, adjointOp, gradFnName, gradFnSignature); - - // Run the forward pass and cache the circuit. 
- Value c_true = rewriter.create(
- loc, rewriter.getIntegerAttr(IntegerType::get(ctx, 1), 1));
- Value c_false = rewriter.create(
- loc, rewriter.getIntegerAttr(IntegerType::get(ctx, 1), 0));
- rewriter.create(loc, cacheFnDecl, c_true);
- Value qreg = rewriter.create(loc, callee, adjointOp.getArgs()).getResult(0);
- if (!isa(qreg.getType()))
- return callee.emitOpError("qfunc must return quantum register");
- rewriter.create(loc, cacheFnDecl, c_false);
-
- // We follow the C ABI convention of passing result memrefs as struct pointers in the
- // arguments to the C function, although in this case as a variadic argument list to allow
- // for a varying number of results in a single signature.
- Value c1 = rewriter.create(loc, rewriter.getI64IntegerAttr(1));
- Value numResults = rewriter.create(
- loc, rewriter.getI64IntegerAttr(adjointOp.getDataIn().size()));
- SmallVector<Value> args = {numResults};
-
- // Might need to modify here.
- for (Value memref : adjointOp.getDataIn()) {
- Value newArg = rewriter.create(
- loc, LLVM::LLVMPointerType::get(rewriter.getContext()), vectorType, c1);
- rewriter.create(loc, memref, newArg);
- args.push_back(newArg);
- }
-
- rewriter.create(loc, gradFnDecl, args);
- rewriter.create(loc, qreg);
- mlir::bufferization::replaceOpWithNewBufferizedOp(
- rewriter, adjointOp);
-
+ rewriter.create(loc, TypeRange{}, op.getCalleeAttr(), adaptor.getGradSize(),
+ adaptor.getArgs(), memrefValues);
+ bufferization::replaceOpWithBufferizedValues(rewriter, op, memrefValues);
 return success();
 }
 };

From 4428eec98b1ec6ecc349ac865e0bab4c988595ef Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 4 Sep 2024 17:03:21 -0400
Subject: [PATCH 058/183] Fix bufferization in AdjointOp

---
 .../BufferizableOpInterfaceImpl.cpp | 46 ++++++++++++----------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
index 15c2090abd..4e95bd0373 100644
--- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -19,27 +19,6 @@ using namespace catalyst::gradient;

 namespace {

-constexpr int64_t UNKNOWN = ShapedType::kDynamic;
-
-LLVM::LLVMFuncOp ensureFunctionDeclaration(RewriterBase &rewriter, Operation *op,
- StringRef fnSymbol, Type fnType)
-{
- Operation *fnDecl = SymbolTable::lookupNearestSymbolFrom(op, rewriter.getStringAttr(fnSymbol));
-
- if (!fnDecl) {
- RewriterBase::InsertionGuard insertGuard(rewriter);
- ModuleOp mod = op->getParentOfType<ModuleOp>();
- rewriter.setInsertionPointToStart(mod.getBody());
-
- fnDecl = rewriter.create<LLVM::LLVMFuncOp>(op->getLoc(), fnSymbol, fnType);
- }
- else {
- assert(isa<LLVM::LLVMFuncOp>(fnDecl) && "QIR function declaration is not a LLVMFuncOp");
- }
-
- return cast<LLVM::LLVMFuncOp>(fnDecl);
-}
-
 struct AdjointOpInterface
 : public bufferization::BufferizableOpInterface::ExternalModel<AdjointOpInterface, AdjointOp> {
 bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
@@ -65,12 +44,17 @@ struct AdjointOpInterface
 const bufferization::BufferizationOptions &options) const
 {
 llvm::outs() << "Happens?\n";
+
+ auto adjointOp = cast<AdjointOp>(op);
+
+ bufferization::BufferizeTypeConverter typeConverter;
+
 SmallVector<Type> resTypes;
- if (failed(getTypeConverter()->convertTypes(op.getResultTypes(), resTypes)))
+ if (failed(typeConverter.convertTypes(adjointOp.getResultTypes(), resTypes)))
 return failure();

- Location loc = op.getLoc();
- Value gradSize = op.getGradSize();
+ Location loc = adjointOp.getLoc();
+ Value gradSize = adjointOp.getGradSize();
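+ // BufferizeTypeConverter maps each tensor result type to the equivalent
+ // memref type (e.g. tensor<?xf64> -> memref<?xf64>), so every entry of
+ // resTypes below is a MemRefType and yields one allocation per result.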
SmallVector memrefValues; for (Type resType : resTypes) { MemRefType memrefType = cast(resType); @@ -78,8 +62,18 @@ struct AdjointOpInterface memrefValues.push_back(memrefValue); } - rewriter.create(loc, TypeRange{}, op.getCalleeAttr(), adaptor.getGradSize(), - adaptor.getArgs(), memrefValues); + SmallVector bufferArgs; + ValueRange operands = adjointOp.getArgs(); + for (Value operand : operands) { + FailureOr opBuffer = getBuffer(rewriter, operand, options); + if (failed(opBuffer)) + return failure(); + bufferArgs.push_back(*opBuffer); + } + + + rewriter.create(loc, TypeRange{}, adjointOp.getCalleeAttr(), adjointOp.getGradSize(), + bufferArgs, memrefValues); bufferization::replaceOpWithBufferizedValues(rewriter, op, memrefValues); return success(); } From dde8e6849631a54726f9aec158e16a8537421747 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 5 Sep 2024 11:37:35 -0400 Subject: [PATCH 059/183] Add backpropOp bufferization --- mlir/lib/Gradient/IR/GradientDialect.cpp | 3 +- .../BufferizableOpInterfaceImpl.cpp | 135 +++++++++++++++++- 2 files changed, 134 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Gradient/IR/GradientDialect.cpp b/mlir/lib/Gradient/IR/GradientDialect.cpp index 74f7d05aa7..c80b1b5eb6 100644 --- a/mlir/lib/Gradient/IR/GradientDialect.cpp +++ b/mlir/lib/Gradient/IR/GradientDialect.cpp @@ -51,7 +51,8 @@ void GradientDialect::initialize() #include "Gradient/IR/GradientOps.cpp.inc" >(); addInterface(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 4e95bd0373..50393fc416 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -5,6 +5,7 @@ #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -12,6 +13,7 @@ #include "Gradient/IR/GradientOps.h" #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" +#include "Gradient/Utils/GradientShape.h" #include "Quantum/IR/QuantumOps.h" using namespace mlir; @@ -19,12 +21,42 @@ using namespace catalyst::gradient; namespace { +Value generateAllocation(OpBuilder &builder, Location loc, Value reference) +{ + auto memrefType = cast(reference.getType()); + // Get dynamic dimension sizes from the provided reference value if necessary. + SmallVector dynamicDims; + if (!memrefType.hasStaticShape()) { + for (int64_t dim = 0; dim < memrefType.getRank(); dim++) { + if (memrefType.isDynamicDim(dim)) { + Value dimIndex = builder.create(loc, dim); + dynamicDims.push_back(builder.create(loc, reference, dimIndex)); + } + } + } + + return builder.create(loc, memrefType, dynamicDims); +} + +/// Helper function to generate a set of memref allocations. +/// +/// The allocation size and shape is deduced from a list of existing memref values. 
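+/// For instance, passing a BackpropOp's cotangent values produces one fresh
+/// allocation per cotangent, each matching that cotangent's shape and type.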
+/// +void generateAllocations(RewriterBase &rewriter, Location loc, + SmallVectorImpl &allocations, ValueRange referenceValues) +{ + for (Value memref : referenceValues) { + allocations.push_back( + generateAllocation(rewriter, loc, cast>(memref))); + } +} + struct AdjointOpInterface : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, @@ -43,8 +75,6 @@ struct AdjointOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { - llvm::outs() << "Happens?\n"; - auto adjointOp = cast(op); bufferization::BufferizeTypeConverter typeConverter; @@ -79,11 +109,110 @@ struct AdjointOpInterface } }; +struct BackpropOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return true; + } + + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const + { + auto backpropOp = cast(op); + + Location loc = backpropOp.getLoc(); + SmallVector gradients; + SmallVector argShadows; + // Conceptually a map from scalar result indices (w.r.t. other scalars) to the position in + // the overall list of returned gradients. + // For instance, a backprop op that returns (tensor, f64, tensor, f64, f64) will have + // scalarIndices = {1, 3, 4}. + SmallVector scalarIndices; + SmallVector scalarReturnTypes; + std::vector diffArgs = + computeDiffArgs(backpropOp.getArgs(), backpropOp.getDiffArgIndicesAttr()); + for (const auto &[idx, diffArg] : llvm::enumerate(diffArgs)) { + // Allocate buffers to place the differentiation results (gradients) into. Enzyme refers + // to these as shadow arguments. There is one result for each differentiable MemRef + // argument, with a matching shape and type. + if (isa(diffArg.getType())) { + Value shadow = generateAllocation(rewriter, loc, diffArg); + gradients.push_back(shadow); + argShadows.push_back(shadow); + } + else if (isa(diffArg.getType())) { + scalarReturnTypes.push_back(diffArg.getType()); + scalarIndices.push_back(idx); + // Put a null placeholder value that will be filled in with the result of the + // bufferized BackpropOp. + gradients.push_back(Value()); + } + } + + // Enzyme requires buffers for the primal outputs as well, even though we don't need their + // values. We'll mark them dupNoNeed later on to allow Enzyme to optimize away their + // computation. + SmallVector calleeResults, resShadows; + ValueRange cotangents = backpropOp.getCotangents(); + generateAllocations(rewriter, loc, calleeResults, cotangents); + // Enzyme mutates the result shadows but the cotangent tensors must be immutable, so we + // create copies to pass into Enzyme. Concretely, this issue pops up with multiple + // BackpropOps that have the same cotangent tensor due to a CSE effect from one-shot + // bufferization. 
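+ // (Without fresh copies, Enzyme would accumulate gradients into a buffer
+ // that another BackpropOp may still need to read as an intact cotangent.)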
+ generateAllocations(rewriter, loc, resShadows, cotangents); + for (const auto &[cotangent, resShadow] : llvm::zip(cotangents, resShadows)) { + rewriter.create(loc, cotangent, resShadow); + } + + DenseIntElementsAttr diffArgIndicesAttr = backpropOp.getDiffArgIndices().value_or(nullptr); + auto bufferizedBackpropOp = rewriter.create( + loc, TypeRange{}, scalarReturnTypes, backpropOp.getCalleeAttr(), backpropOp.getArgs(), argShadows, + calleeResults, resShadows, diffArgIndicesAttr, backpropOp.getKeepValueResultsAttr()); + + // Fill in the null placeholders. + for (const auto &[idx, scalarResult] : + llvm::enumerate(bufferizedBackpropOp.getGradients())) { + gradients[scalarIndices[idx]] = scalarResult; + } + + // BackpropOp can return two results for value_and_grad: values and gradients + // or only one for grad: gradients + SmallVector results; + { + // If we are lowering a value_and_grad operation, then take values from the + // calleeResults + if (!backpropOp.getVals().empty()) { + results.insert(results.end(), calleeResults.begin(), calleeResults.end()); + } + results.insert(results.end(), gradients.begin(), gradients.end()); + } + + rewriter.replaceOp(op, results); + return success(); + } +}; + } // namespace void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { AdjointOp::attachInterface(*ctx); + BackpropOp::attachInterface(*ctx); }); } \ No newline at end of file From af0da0c7af39022adbd3d12dee82eb88fbf7579a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 5 Sep 2024 11:48:59 -0400 Subject: [PATCH 060/183] Add restrict to ToTensorOp --- .../lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp | 4 ++-- .../Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp | 4 ++-- mlir/test/Gradient/PS_QuantumGradientTest.mlir | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp index 8cbf83224c..e84a619576 100644 --- a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp +++ b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp @@ -147,7 +147,7 @@ func::FuncOp genSplitPreprocessed(PatternRewriter &rewriter, Location loc, func: PatternRewriter::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(&splitFn.getBody().front()); Value paramsBuffer = rewriter.create(loc, paramsBufferType, paramCount); - Value paramsTensor = rewriter.create(loc, paramsBuffer); + Value paramsTensor = rewriter.create(loc, paramsBuffer, /*restrict=*/true); qnodeQuantumArgs.push_back(paramsTensor); MemRefType paramsProcessedType = MemRefType::get({}, rewriter.getIndexType()); @@ -290,7 +290,7 @@ func::FuncOp genArgMapFunction(PatternRewriter &rewriter, Location loc, func::Fu PatternRewriter::InsertionGuard insertionGuard(rewriter); rewriter.setInsertionPoint(returnOp); Value paramsVector = - rewriter.create(loc, paramsVectorType, paramsBuffer); + rewriter.create(loc, paramsVectorType, paramsBuffer, /*restrict=*/true) ; returnOp.getOperandsMutable().assign(paramsVector); } }); diff --git a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp index 15f005e464..e0a1670b9a 100644 --- a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp +++ b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp @@ -58,7 +58,7 @@ static 
std::vector computePartialDerivative(PatternRewriter &rewriter, Lo { constexpr double shift = PI / 2; ShapedType shiftVectorType = RankedTensorType::get({numShifts}, rewriter.getF64Type()); - Value selectorVector = rewriter.create(loc, selectorBuffer); + Value selectorVector = rewriter.create(loc, selectorBuffer, /*restrict=*/true); // Define the shift vectors (pos/neg) as sparse tensor constants. DenseElementsAttr nonZeroIndices = rewriter.getI64TensorAttr(currentShift); @@ -285,7 +285,7 @@ func::FuncOp ParameterShiftLowering::genQGradFunction(PatternRewriter &rewriter, gradientTensors.reserve(gradResTypes.size()); for (Value gradientBuffer : gradientBuffers) { gradientTensors.push_back( - rewriter.create(loc, gradientBuffer)); + rewriter.create(loc, gradientBuffer, /*restrict=*/true)); } op->setOperands(gradientTensors); } diff --git a/mlir/test/Gradient/PS_QuantumGradientTest.mlir b/mlir/test/Gradient/PS_QuantumGradientTest.mlir index c13ca339d4..8c8034981b 100644 --- a/mlir/test/Gradient/PS_QuantumGradientTest.mlir +++ b/mlir/test/Gradient/PS_QuantumGradientTest.mlir @@ -425,7 +425,7 @@ func.func @multi_res_circuit(%arg0: f64) -> (f64, tensor<2xf64>) attributes {qno %r = quantum.alloc(1) : !quantum.reg %q_0 = quantum.extract %r[%idx] : !quantum.reg -> !quantum.bit - // CHECK: [[SEL:%.+]] = bufferization.to_tensor [[SELBUFF]] : memref<0xindex> + // CHECK: [[SEL:%.+]] = bufferization.to_tensor [[SELBUFF]] restrict : memref<0xindex> // CHECK: [[EVALPOS:%.+]]:2 = call @multi_res_circuit.shifted(%arg0, [[SHIFTPOS]], [[SEL]]) : {{.+}} -> (f64, tensor<2xf64>) // CHECK: [[EVALNEG:%.+]]:2 = call @multi_res_circuit.shifted(%arg0, [[SHIFTNEG]], [[SEL]]) : {{.+}} -> (f64, tensor<2xf64>) // CHECK: [[DIFF0:%.+]] = arith.subf [[EVALPOS]]#0, [[EVALNEG]]#0 From cae09e05b40ec20b8a0f3d24c4f20686dbd75ca9 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 5 Sep 2024 14:46:27 -0400 Subject: [PATCH 061/183] Correct bufferized args in backpropOp --- .../BufferizableOpInterfaceImpl.cpp | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 50393fc416..cc9b403139 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -104,7 +104,7 @@ struct AdjointOpInterface rewriter.create(loc, TypeRange{}, adjointOp.getCalleeAttr(), adjointOp.getGradSize(), bufferArgs, memrefValues); - bufferization::replaceOpWithBufferizedValues(rewriter, op, memrefValues); + bufferization::replaceOpWithBufferizedValues(rewriter, op, memrefValues); return success(); } }; @@ -144,8 +144,24 @@ struct BackpropOpInterface // scalarIndices = {1, 3, 4}. SmallVector scalarIndices; SmallVector scalarReturnTypes; + + SmallVector bufferArgs; + ValueRange operands = backpropOp.getArgs(); + for (Value operand : operands) { + if(isa(operand.getType())) { + FailureOr opBuffer = getBuffer(rewriter, operand, options); + if (failed(opBuffer)) + return failure(); + bufferArgs.push_back(*opBuffer); + } else { + bufferArgs.push_back(operand); + } + + } + std::vector diffArgs = - computeDiffArgs(backpropOp.getArgs(), backpropOp.getDiffArgIndicesAttr()); + computeDiffArgs(bufferArgs, backpropOp.getDiffArgIndicesAttr()); + for (const auto &[idx, diffArg] : llvm::enumerate(diffArgs)) { // Allocate buffers to place the differentiation results (gradients) into. Enzyme refers // to these as shadow arguments. 
There is one result for each differentiable MemRef @@ -169,21 +185,33 @@ struct BackpropOpInterface // computation. SmallVector calleeResults, resShadows; ValueRange cotangents = backpropOp.getCotangents(); - generateAllocations(rewriter, loc, calleeResults, cotangents); + SmallVector bufferCotangentsList; + for (Value operand : cotangents) { + FailureOr opBuffer = getBuffer(rewriter, operand, options); + if (failed(opBuffer)) + return failure(); + bufferCotangentsList.push_back(*opBuffer); + } + mlir::ValueRange bufferCotangents(bufferCotangentsList); + + generateAllocations(rewriter, loc, calleeResults, bufferCotangents); // Enzyme mutates the result shadows but the cotangent tensors must be immutable, so we // create copies to pass into Enzyme. Concretely, this issue pops up with multiple // BackpropOps that have the same cotangent tensor due to a CSE effect from one-shot // bufferization. - generateAllocations(rewriter, loc, resShadows, cotangents); - for (const auto &[cotangent, resShadow] : llvm::zip(cotangents, resShadows)) { + generateAllocations(rewriter, loc, resShadows, bufferCotangents); + for (const auto &[cotangent, resShadow] : llvm::zip(bufferCotangents, resShadows)) { rewriter.create(loc, cotangent, resShadow); } + + llvm::outs() << "======================\n"; + llvm::outs() << scalarReturnTypes; + llvm::outs() << "======================\n"; DenseIntElementsAttr diffArgIndicesAttr = backpropOp.getDiffArgIndices().value_or(nullptr); auto bufferizedBackpropOp = rewriter.create( - loc, TypeRange{}, scalarReturnTypes, backpropOp.getCalleeAttr(), backpropOp.getArgs(), argShadows, + loc, TypeRange{}, scalarReturnTypes, backpropOp.getCalleeAttr(), bufferArgs, argShadows, calleeResults, resShadows, diffArgIndicesAttr, backpropOp.getKeepValueResultsAttr()); - // Fill in the null placeholders. 
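 // (scalarIndices, built above, records where each scalar gradient belongs
 // among the buffer gradients that were allocated as shadows.)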
for (const auto &[idx, scalarResult] : llvm::enumerate(bufferizedBackpropOp.getGradients())) { @@ -202,7 +230,7 @@ struct BackpropOpInterface results.insert(results.end(), gradients.begin(), gradients.end()); } - rewriter.replaceOp(op, results); + bufferization::replaceOpWithBufferizedValues(rewriter, op, results); return success(); } }; From 1182aca021d6dd182ac37f792dad6202273ae782 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 12:02:56 -0400 Subject: [PATCH 062/183] Use identity-layout-map --- frontend/catalyst/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 91df7f9716..3333eb4e9b 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -194,7 +194,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "BufferizationPass", [ "eliminate-empty-tensors", - "one-shot-bufferize{bufferize-function-boundaries}", + "one-shot-bufferize{bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map}", # "func.func(buffer-hoisting)", # "func.func(buffer-loop-hoisting)", # "buffer-results-to-out-params", From 67d1d6a5a2f9f1d70dfd06bf96b9148f7c58fe60 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 12:07:25 -0400 Subject: [PATCH 063/183] Update llvm patch with new ReturnLike mechanism --- mlir/patches/moduleOp-bufferization.patch | 335 +++++++++++++++++----- 1 file changed, 261 insertions(+), 74 deletions(-) diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch index 75a8a65639..51cc79536e 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/moduleOp-bufferization.patch @@ -1,5 +1,53 @@ +diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h +index 0b91d3d675b7..8bed0dfc5814 100644 +--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h ++++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h +@@ -50,24 +50,24 @@ struct FuncAnalysisState : public OneShotAnalysisState::Extension { + + /// A mapping of ReturnOp OpOperand indices to equivalent FuncOp BBArg + /// indices. +- DenseMap equivalentFuncArgs; ++ DenseMap equivalentFuncArgs; + + /// A mapping of FuncOp BBArg indices to aliasing ReturnOp OpOperand indices. +- DenseMap aliasingReturnVals; ++ DenseMap aliasingReturnVals; + + /// A set of all read BlockArguments of FuncOps. +- DenseMap readBbArgs; ++ DenseMap readBbArgs; + + /// A set of all written-to BlockArguments of FuncOps. +- DenseMap writtenBbArgs; ++ DenseMap writtenBbArgs; + + /// Keep track of which FuncOps are fully analyzed or currently being + /// analyzed. +- DenseMap analyzedFuncOps; ++ DenseMap analyzedFuncOps; + + /// This function is called right before analyzing the given FuncOp. It + /// initializes the data structures for the FuncOp in this state object. 
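++ /// (All of the maps above now key on FunctionOpInterface; the interface
++ /// hashes via its underlying Operation*, so lookups behave as before.)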
+- void startFunctionAnalysis(FuncOp funcOp); ++ void startFunctionAnalysis(FunctionOpInterface funcOp); + }; + + void registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry); +diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +index 053ea7935260..fed1c49d22be 100644 +--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp ++++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +@@ -22,7 +22,7 @@ namespace mlir { + namespace bufferization { + namespace func_ext { + +-void FuncAnalysisState::startFunctionAnalysis(FuncOp funcOp) { ++void FuncAnalysisState::startFunctionAnalysis(FunctionOpInterface funcOp) { + analyzedFuncOps[funcOp] = FuncOpAnalysisState::InProgress; + auto createdEquiv = equivalentFuncArgs.try_emplace(funcOp, IndexMapping()); + auto createdAliasingResults = diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp -index 0a4072605c26..5231fe860553 100644 +index 0a4072605c26..bd054ac4e7b8 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -75,7 +75,7 @@ using namespace mlir::bufferization; @@ -11,23 +59,148 @@ index 0a4072605c26..5231fe860553 100644 /// Get or create FuncAnalysisState. static FuncAnalysisState & -@@ -247,6 +247,15 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { - SymbolTable::lookupNearestSymbolFrom(callOp, sym)); +@@ -88,15 +88,19 @@ getOrCreateFuncAnalysisState(OneShotAnalysisState &state) { + + /// Return the unique ReturnOp that terminates `funcOp`. + /// Return nullptr if there is no such unique ReturnOp. +-static func::ReturnOp getAssumedUniqueReturnOp(func::FuncOp funcOp) { +- func::ReturnOp returnOp; +- for (Block &b : funcOp.getBody()) { +- if (auto candidateOp = dyn_cast(b.getTerminator())) { ++/// Return `funcOp` it self if there is no ReturnOp. ++static Operation* getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { ++ Operation *returnOp = nullptr; ++ for (Block &b : funcOp.getFunctionBody()) { ++ auto candidateOp = b.getTerminator(); ++ if (candidateOp && candidateOp->hasTrait()) { + if (returnOp) + return nullptr; + returnOp = candidateOp; + } + } ++ if (!returnOp) ++ return funcOp; + return returnOp; } +@@ -126,16 +130,15 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal, + /// Store function BlockArguments that are equivalent to/aliasing a returned + /// value in FuncAnalysisState. + static LogicalResult +-aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, ++aliasingFuncOpBBArgsAnalysis(FunctionOpInterface funcOp, OneShotAnalysisState &state, + FuncAnalysisState &funcState) { +- if (funcOp.getBody().empty()) { ++ if (funcOp.getFunctionBody().empty()) { + // No function body available. Conservatively assume that every tensor + // return value may alias with any tensor bbArg. 
+- FunctionType type = funcOp.getFunctionType(); +- for (const auto &inputIt : llvm::enumerate(type.getInputs())) { ++ for (const auto &inputIt : llvm::enumerate(funcOp.getArgumentTypes())) { + if (!isa(inputIt.value())) + continue; +- for (const auto &resultIt : llvm::enumerate(type.getResults())) { ++ for (const auto &resultIt : llvm::enumerate(funcOp.getResultTypes())) { + if (!isa(resultIt.value())) + continue; + int64_t returnIdx = resultIt.index(); +@@ -147,7 +150,10 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, + } + + // Support only single return-terminated block in the function. +- func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); ++ // If funcOp has no returnOp, skip the following analysis. ++ Operation *returnOp = getAssumedUniqueReturnOp(funcOp); ++ if (returnOp == funcOp) ++ return success(); + assert(returnOp && "expected func with single return op"); + + for (OpOperand &returnVal : returnOp->getOpOperands()) +@@ -168,7 +174,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, + return success(); + } + +-static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, ++static void annotateFuncArgAccess(FunctionOpInterface funcOp, int64_t idx, bool isRead, + bool isWritten) { + OpBuilder b(funcOp.getContext()); + Attribute accessType; +@@ -189,12 +195,12 @@ static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, + /// function with unknown ops, we conservatively assume that such ops bufferize + /// to a read + write. + static LogicalResult +-funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, ++funcOpBbArgReadWriteAnalysis(FunctionOpInterface funcOp, OneShotAnalysisState &state, + FuncAnalysisState &funcState) { +- for (int64_t idx = 0, e = funcOp.getFunctionType().getNumInputs(); idx < e; ++ for (int64_t idx = 0, e = funcOp.getNumArguments(); idx < e; + ++idx) { + // Skip non-tensor arguments. +- if (!isa(funcOp.getFunctionType().getInput(idx))) ++ if (!isa(funcOp.getArgumentTypes()[idx])) + continue; + bool isRead; + bool isWritten; +@@ -204,7 +210,7 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, + StringRef str = accessAttr.getValue(); + isRead = str == "read" || str == "read-write"; + isWritten = str == "write" || str == "read-write"; +- } else if (funcOp.getBody().empty()) { ++ } else if (funcOp.getFunctionBody().empty()) { + // If the function has no body, conservatively assume that all args are + // read + written. + isRead = true; +@@ -230,20 +236,19 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, + + /// Remove bufferization attributes on FuncOp arguments. + static void removeBufferizationAttributes(BlockArgument bbArg) { +- auto funcOp = cast(bbArg.getOwner()->getParentOp()); ++ auto funcOp = cast(bbArg.getOwner()->getParentOp()); + funcOp.removeArgAttr(bbArg.getArgNumber(), + BufferizationDialect::kBufferLayoutAttrName); + funcOp.removeArgAttr(bbArg.getArgNumber(), + BufferizationDialect::kWritableAttrName); + } + +-/// Return the func::FuncOp called by `callOp`. 
+-static func::FuncOp getCalledFunction(func::CallOp callOp) { +static FunctionOpInterface getCalledFunction(CallOpInterface callOp) { -+ SymbolRefAttr sym = -+ llvm::dyn_cast_if_present(callOp.getCallableForCallee()); -+ if (!sym) -+ return nullptr; + SymbolRefAttr sym = + llvm::dyn_cast_if_present(callOp.getCallableForCallee()); + if (!sym) + return nullptr; +- return dyn_cast_or_null( + return dyn_cast_or_null( -+ SymbolTable::lookupNearestSymbolFrom(callOp, sym)); -+} -+ - /// Gather equivalence info of CallOps. + SymbolTable::lookupNearestSymbolFrom(callOp, sym)); + } + +@@ -251,12 +256,12 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { /// Note: This only adds new equivalence info if the called function was already /// analyzed. -@@ -277,10 +286,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, + // TODO: This does not handle cyclic function call graphs etc. +-static void equivalenceAnalysis(func::FuncOp funcOp, ++static void equivalenceAnalysis(FunctionOpInterface funcOp, + OneShotAnalysisState &state, + FuncAnalysisState &funcState) { +- funcOp->walk([&](func::CallOp callOp) { +- func::FuncOp calledFunction = getCalledFunction(callOp); +- assert(calledFunction && "could not retrieved called func::FuncOp"); ++ funcOp->walk([&](CallOpInterface callOp) { ++ FunctionOpInterface calledFunction = getCalledFunction(callOp); ++ assert(calledFunction && "could not retrieved called FunctionOpInterface"); + + // No equivalence info available for the called function. + if (!funcState.equivalentFuncArgs.count(calledFunction)) +@@ -267,7 +272,7 @@ static void equivalenceAnalysis(func::FuncOp funcOp, + int64_t bbargIdx = it.second; + if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) + continue; +- Value returnVal = callOp.getResult(returnIdx); ++ Value returnVal = callOp->getResult(returnIdx); + Value argVal = callOp->getOperand(bbargIdx); + state.unionEquivalenceClasses(returnVal, argVal); + } +@@ -277,10 +282,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, } /// Return "true" if the given function signature has tensor semantics. @@ -41,7 +214,7 @@ index 0a4072605c26..5231fe860553 100644 llvm::IsaPred); } -@@ -291,26 +300,30 @@ static bool hasTensorSignature(func::FuncOp funcOp) { +@@ -291,17 +296,17 @@ static bool hasTensorSignature(func::FuncOp funcOp) { /// retrieve the called FuncOp from any func::CallOp. static LogicalResult getFuncOpsOrderedByCalls(ModuleOp moduleOp, @@ -58,33 +231,56 @@ index 0a4072605c26..5231fe860553 100644 - if (!funcOp.getBody().empty()) { - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); - if (!returnOp) -- return funcOp->emitError() -- << "cannot bufferize a FuncOp with tensors and " -- "without a unique ReturnOp"; + DenseMap numberCallOpsContainedInFuncOp; + WalkResult res = moduleOp.walk([&](FunctionOpInterface funcOp) -> WalkResult { -+ // Only handle ReturnOp if funcOp is exactly the FuncOp type. 
-+ if(isa(funcOp)) { -+ FuncOp funcOpCasted = cast(funcOp); -+ if (!funcOpCasted.getBody().empty()) { -+ func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOpCasted); -+ if (!returnOp) -+ return funcOp->emitError() -+ << "cannot bufferize a FuncOp with tensors and " -+ "without a unique ReturnOp"; -+ } - } ++ if (!funcOp.getFunctionBody().empty()) { ++ Operation *returnOp = getAssumedUniqueReturnOp(funcOp); ++ if (!returnOp && returnOp != funcOp) + return funcOp->emitError() + << "cannot bufferize a FuncOp with tensors and " + "without a unique ReturnOp"; +@@ -309,9 +314,9 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, // Collect function calls and populate the caller map. numberCallOpsContainedInFuncOp[funcOp] = 0; - return funcOp.walk([&](func::CallOp callOp) -> WalkResult { - func::FuncOp calledFunction = getCalledFunction(callOp); +- assert(calledFunction && "could not retrieved called func::FuncOp"); + return funcOp.walk([&](CallOpInterface callOp) -> WalkResult { + FunctionOpInterface calledFunction = getCalledFunction(callOp); - assert(calledFunction && "could not retrieved called func::FuncOp"); ++ assert(calledFunction && "could not retrieved called FunctionOpInterface"); // If the called function does not have any tensors in its signature, then // it is not necessary to bufferize the callee before the caller. -@@ -379,7 +392,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, + if (!hasTensorSignature(calledFunction)) +@@ -349,11 +354,15 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, + /// most generic layout map as function return types. After bufferizing the + /// entire function body, a more concise memref type can potentially be used for + /// the return type of the function. +-static void foldMemRefCasts(func::FuncOp funcOp) { +- if (funcOp.getBody().empty()) ++static void foldMemRefCasts(FunctionOpInterface funcOp) { ++ if (funcOp.getFunctionBody().empty()) ++ return; ++ ++ Operation *returnOp = getAssumedUniqueReturnOp(funcOp); ++ ++ if (!returnOp || returnOp == funcOp) + return; + +- func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); + SmallVector resultTypes; + + for (OpOperand &operand : returnOp->getOpOperands()) { +@@ -366,7 +375,7 @@ static void foldMemRefCasts(func::FuncOp funcOp) { + } + + auto newFuncType = FunctionType::get( +- funcOp.getContext(), funcOp.getFunctionType().getInputs(), resultTypes); ++ funcOp.getContext(), funcOp.getArgumentTypes(), resultTypes); + funcOp.setType(newFuncType); + } + +@@ -379,7 +388,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, FuncAnalysisState &funcState = getOrCreateFuncAnalysisState(state); // A list of functions in the order in which they are analyzed + bufferized. @@ -93,49 +289,26 @@ index 0a4072605c26..5231fe860553 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -388,27 +401,33 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -388,7 +397,8 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, return failure(); // Analyze ops. - for (func::FuncOp funcOp : orderedFuncOps) { -- if (!state.getOptions().isOpAllowed(funcOp)) + for (FunctionOpInterface funcOp : orderedFuncOps) { + -+ // The following analysis is specific to the FuncOp type. -+ if(!isa(funcOp)) -+ continue; -+ FuncOp funcOpCasted = cast(funcOp); -+ -+ if (!state.getOptions().isOpAllowed(funcOpCasted)) + if (!state.getOptions().isOpAllowed(funcOp)) continue; - // Now analyzing function. 
-- funcState.startFunctionAnalysis(funcOp); -+ funcState.startFunctionAnalysis(funcOpCasted); - - // Gather equivalence info for CallOps. -- equivalenceAnalysis(funcOp, state, funcState); -+ equivalenceAnalysis(funcOpCasted, state, funcState); +@@ -416,7 +426,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, - // Analyze funcOp. -- if (failed(analyzeOp(funcOp, state, statistics))) -+ if (failed(analyzeOp(funcOpCasted, state, statistics))) - return failure(); - - // Run some extra function analyses. -- if (failed(aliasingFuncOpBBArgsAnalysis(funcOp, state, funcState)) || -- failed(funcOpBbArgReadWriteAnalysis(funcOp, state, funcState))) -+ if (failed(aliasingFuncOpBBArgsAnalysis(funcOpCasted, state, funcState)) || -+ failed(funcOpBbArgReadWriteAnalysis(funcOpCasted, state, funcState))) - return failure(); - - // Mark op as fully analyzed. -- funcState.analyzedFuncOps[funcOp] = FuncOpAnalysisState::Analyzed; -+ funcState.analyzedFuncOps[funcOpCasted] = FuncOpAnalysisState::Analyzed; - } - - return success(); -@@ -430,7 +449,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( + void mlir::bufferization::removeBufferizationAttributesInModule( + ModuleOp moduleOp) { +- moduleOp.walk([&](func::FuncOp op) { ++ moduleOp.walk([&](FunctionOpInterface op) { + for (BlockArgument bbArg : op.getArguments()) + removeBufferizationAttributes(bbArg); + }); +@@ -430,7 +440,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( IRRewriter rewriter(moduleOp.getContext()); // A list of functions in the order in which they are analyzed + bufferized. @@ -144,7 +317,7 @@ index 0a4072605c26..5231fe860553 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -439,11 +458,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -439,11 +449,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( return failure(); // Bufferize functions. @@ -158,14 +331,28 @@ index 0a4072605c26..5231fe860553 100644 // This function was not analyzed and RaW conflicts were not resolved. // Buffer copies must be inserted before every write. OneShotBufferizationOptions updatedOptions = options; -@@ -456,8 +475,8 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( - } - - // Change buffer return types to more precise layout maps. -- if (options.inferFunctionResultLayout) -- foldMemRefCasts(funcOp); -+ if (options.inferFunctionResultLayout && isa(funcOp)) -+ foldMemRefCasts(cast(funcOp)); - } - +@@ -463,7 +473,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( // Bufferize all other ops. + for (Operation &op : llvm::make_early_inc_range(moduleOp.getOps())) { + // Functions were already bufferized. +- if (isa(&op)) ++ if (isa(&op)) + continue; + if (failed(bufferizeOp(&op, options, statistics))) + return failure(); +@@ -490,12 +500,12 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize( + // FuncOps whose names are specified in options.noAnalysisFuncFilter will + // not be analyzed. Ops in these FuncOps will not be analyzed as well. 
+ OpFilter::Entry::FilterFn analysisFilterFn = [=](Operation *op) { +- auto func = dyn_cast(op); ++ auto func = dyn_cast(op); + if (!func) +- func = op->getParentOfType(); ++ func = op->getParentOfType(); + if (func) + return llvm::is_contained(options.noAnalysisFuncFilter, +- func.getSymName()); ++ func.getName()); + return false; + }; + OneShotBufferizationOptions updatedOptions(options); From c6d834cfca547087e5661d13e323039eea208948 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 12:17:43 -0400 Subject: [PATCH 064/183] Cleanup --- mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index cc9b403139..c4a04ebae1 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -204,10 +204,6 @@ struct BackpropOpInterface rewriter.create(loc, cotangent, resShadow); } - - llvm::outs() << "======================\n"; - llvm::outs() << scalarReturnTypes; - llvm::outs() << "======================\n"; DenseIntElementsAttr diffArgIndicesAttr = backpropOp.getDiffArgIndices().value_or(nullptr); auto bufferizedBackpropOp = rewriter.create( loc, TypeRange{}, scalarReturnTypes, backpropOp.getCalleeAttr(), bufferArgs, argShadows, From f8a1a8ff5cebf98d03535f584c964967c6c9ec8f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 12:44:07 -0400 Subject: [PATCH 065/183] Include convert-elementwise-to-linalg back to bufferization pass --- frontend/catalyst/compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 3333eb4e9b..669a6aa2a6 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -194,6 +194,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "BufferizationPass", [ "eliminate-empty-tensors", + "convert-elementwise-to-linalg", "one-shot-bufferize{bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map}", # "func.func(buffer-hoisting)", # "func.func(buffer-loop-hoisting)", From 9a1a79121f19cbcc9adb907deb350e235d1db5ba Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 13:07:06 -0400 Subject: [PATCH 066/183] Add ForwardOp Bufferization --- .../BufferizableOpInterfaceImpl.cpp | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index c4a04ebae1..6007407420 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -231,6 +231,128 @@ struct BackpropOpInterface } }; +struct ForwardOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return true; + } + + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions 
&options) const + { + auto forwardOp = cast(op); + + auto argc = forwardOp.getArgc(); + auto resc = forwardOp.getResc(); + SmallVector inputs; + SmallVector differentials; + SmallVector outputs; + SmallVector cotangents; + + Block *block; + rewriter.modifyOpInPlace(op, [&] { block = forwardOp.addEntryBlock(); }); + + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(block); + auto params = forwardOp.getArguments(); + + for (size_t i = 0; i < argc * 2; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? differentials.push_back(val) : inputs.push_back(val); + } + + auto upperLimit = (argc * 2) + (resc * 2); + for (size_t i = argc * 2; i < upperLimit; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? cotangents.push_back(val) : outputs.push_back(val); + } + + auto implAttr = forwardOp.getImplementationAttr(); + auto impl = forwardOp.getImplementation(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); + auto implResTy = implOp.getResultTypes(); + Location loc = forwardOp.getLoc(); + + SmallVector tensorInputs; + for (auto input : inputs) { + Value tensorIn = rewriter.create(loc, input); + tensorInputs.push_back(tensorIn); + } + + auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); + SmallVector tensorOutputs(callOp.getResults()); + + for (auto [memrefOutput, tensorOutput] : llvm::zip(outputs, tensorOutputs)) { + Value castVal = rewriter.create(loc, memrefOutput.getType(), + tensorOutput); + rewriter.create(loc, castVal, memrefOutput); + } + + auto tapeCount = forwardOp.getTape(); + SmallVector tapeOutputs; + tapeOutputs.insert(tapeOutputs.begin(), tensorOutputs.end() - tapeCount, + tensorOutputs.end()); + + SmallVector tapeMemrefOutputs; + for (auto [tapeTensorOutput, memrefTapeOutput] : + llvm::zip(tapeOutputs, forwardOp.getResultTypes())) { + Value castVal = + rewriter.create(loc, memrefTapeOutput, tapeTensorOutput); + tapeMemrefOutputs.push_back(castVal); + } + + auto F = rewriter.getIntegerAttr(rewriter.getI1Type(), 0); + bufferization::replaceOpWithNewBufferizedOp(rewriter, op, tapeMemrefOutputs, F); + + return success(); + } +}; + +struct ReverseOpInterface + : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return true; + } + + bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return false; + } + + bufferization::AliasingValueList + getAliasingValues(Operation *op, OpOperand &opOperand, + const bufferization::AnalysisState &state) const + { + return {}; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const bufferization::BufferizationOptions &options) const + { + return success(); + } +}; + } // namespace void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) From 3aead9cfe8ea7c233fb4b43354cad1e04bbd9df6 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 13:10:57 -0400 Subject: [PATCH 067/183] Add reverseOp Bufferization --- .../BufferizableOpInterfaceImpl.cpp | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 6007407420..be6a76d9a1 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ 
b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -336,7 +336,7 @@ struct ReverseOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; } bufferization::AliasingValueList @@ -349,6 +349,71 @@ struct ReverseOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { + auto reverseOp = cast(op); + + auto argc = reverseOp.getArgc(); + auto resc = reverseOp.getResc(); + SmallVector inputs; + SmallVector differentials; + SmallVector outputs; + SmallVector cotangents; + SmallVector tapeElements; + + Block *block; + rewriter.modifyOpInPlace(op, [&] { block = reverseOp.addEntryBlock(); }); + + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(block); + auto params = reverseOp.getArguments(); + + for (size_t i = 0; i < argc * 2; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? differentials.push_back(val) : inputs.push_back(val); + } + + auto upperLimit = (argc * 2) + (resc * 2); + for (size_t i = argc * 2; i < upperLimit; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? cotangents.push_back(val) : outputs.push_back(val); + } + + auto tapeCount = reverseOp.getTape(); + auto uppestLimit = upperLimit + tapeCount; + for (size_t i = upperLimit; i < uppestLimit; i++) { + tapeElements.push_back(params[i]); + } + + auto implAttr = reverseOp.getImplementationAttr(); + auto impl = reverseOp.getImplementation(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(reverseOp, implAttr); + auto implResTy = implOp.getResultTypes(); + Location loc = reverseOp.getLoc(); + + SmallVector tensorInputs; + for (auto tapeElement : tapeElements) { + Value tensorIn = rewriter.create(loc, tapeElement); + tensorInputs.push_back(tensorIn); + } + + for (auto cotangent : cotangents) { + Value tensorIn = rewriter.create(loc, cotangent); + tensorInputs.push_back(tensorIn); + } + + auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); + SmallVector tensorOutputs(callOp.getResults()); + + for (auto [differential, tensorOutput] : llvm::zip(differentials, tensorOutputs)) { + Value castVal = rewriter.create(loc, differential.getType(), + tensorOutput); + rewriter.create(loc, castVal, differential); + } + + auto T = rewriter.getIntegerAttr(rewriter.getI1Type(), 1); + rewriter.create(loc, ValueRange{}, T); + return success(); } }; From 418f12248515ddb0abdd4232f90a944cd0100120 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 6 Sep 2024 13:14:18 -0400 Subject: [PATCH 068/183] Correct bufferization interface registration --- mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 2 ++ mlir/lib/Quantum/IR/QuantumDialect.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index be6a76d9a1..8f472ca399 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -425,5 +425,7 @@ void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRe registry.addExtension(+[](MLIRContext *ctx, GradientDialect *dialect) { AdjointOp::attachInterface(*ctx); BackpropOp::attachInterface(*ctx); + ForwardOp::attachInterface(*ctx); + ReverseOp::attachInterface(*ctx); }); } \ No newline at end of file diff --git 
a/mlir/lib/Quantum/IR/QuantumDialect.cpp b/mlir/lib/Quantum/IR/QuantumDialect.cpp index 385f4e0ae5..c8c3788efd 100644 --- a/mlir/lib/Quantum/IR/QuantumDialect.cpp +++ b/mlir/lib/Quantum/IR/QuantumDialect.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/IR/DialectImplementation.h" // needed for generated type parser #include "llvm/ADT/TypeSwitch.h" // needed for generated type parser @@ -43,6 +44,9 @@ void QuantumDialect::initialize() #define GET_OP_LIST #include "Quantum/IR/QuantumOps.cpp.inc" >(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// From 576ebe635f622e983af4da7825f347861155431c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 9 Sep 2024 12:23:18 -0400 Subject: [PATCH 069/183] Rebuild MemRefType without memory layout for BackpropOp --- frontend/catalyst/compiler.py | 2 +- mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 669a6aa2a6..6f1ac8933f 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -195,7 +195,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt [ "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map}", + "one-shot-bufferize{bufferize-function-boundaries}", # "func.func(buffer-hoisting)", # "func.func(buffer-loop-hoisting)", # "buffer-results-to-out-params", diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 8f472ca399..e7605ce107 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -23,7 +23,9 @@ namespace { Value generateAllocation(OpBuilder &builder, Location loc, Value reference) { - auto memrefType = cast(reference.getType()); + auto origMemrefType = cast(reference.getType()); + // Rebuild MemRefType without memory layout. + auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); // Get dynamic dimension sizes from the provided reference value if necessary. 
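// (For example, a sketch: with a reference of type memref<?x3xf64>, the size of dim 0 below is read off `reference` via memref.dim so the new allocation matches it.)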
SmallVector dynamicDims; if (!memrefType.hasStaticShape()) { From 3c4056cb65c2000459d9c61e8467e325d5607faa Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 9 Sep 2024 13:35:02 -0400 Subject: [PATCH 070/183] Add back convert-arraylist-to-memref --- frontend/catalyst/compiler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 6f1ac8933f..630bc5c698 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -196,6 +196,10 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "eliminate-empty-tensors", "convert-elementwise-to-linalg", "one-shot-bufferize{bufferize-function-boundaries}", + "convert-arraylist-to-memref", + "convert-bufferization-to-memref", + "canonicalize", + "cp-global-memref", # "func.func(buffer-hoisting)", # "func.func(buffer-loop-hoisting)", # "buffer-results-to-out-params", From 68d6233483563545b1237c3eacdfeffee6ecd02f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 9 Sep 2024 15:32:29 -0400 Subject: [PATCH 071/183] Include scf-for-preprocessing from LLVM PR#87594 --- mlir/patches/scf-for-preprocessing.patch | 128 +++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 mlir/patches/scf-for-preprocessing.patch diff --git a/mlir/patches/scf-for-preprocessing.patch b/mlir/patches/scf-for-preprocessing.patch new file mode 100644 index 0000000000..769556de62 --- /dev/null +++ b/mlir/patches/scf-for-preprocessing.patch @@ -0,0 +1,128 @@ +diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h +index 90b315e83a8cfd..6107219ea94ae1 100644 +--- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h ++++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h +@@ -23,6 +23,9 @@ namespace mlir { + /// Creates a pass that bufferizes the SCF dialect. + std::unique_ptr createSCFBufferizePass(); + ++/// Creates a pass that preprocesses SCF loop before One-Shot Bufferize. ++std::unique_ptr createSCFLoopBufferizationPreprocessingPass(); ++ + /// Creates a pass that specializes for loop for unrolling and + /// vectorization. + std::unique_ptr createForLoopSpecializationPass(); +diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td +index 350611ad86873d..94d3e51a1c9044 100644 +--- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td ++++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td +@@ -18,6 +18,27 @@ def SCFBufferize : Pass<"scf-bufferize"> { + "memref::MemRefDialect"]; + } + ++def SCFLoopBufferizationPreprocessing ++ : Pass<"scf-loop-bufferization-preprocessing"> { ++ let summary = "Preprocess loops before One-Shot Bufferize"; ++ ++ let description = [{ ++ Preprocess `scf.for` loops before running One-Shot Bufferize to support ++ loops where a yielded tensor is not equivalent to the respective iter_arg. ++ Such IR is currently not supported by One-Shot Bufferize. ++ ++ This pass inserts a `bufferization.materialize_in_destination` op for every ++ yielded tensor, such that the yielded value is guaranteed to materialize in ++ the future buffer of the iter_arg; this is done by copying the tensor ++ contents into the iter_arg buffer. Such memcpys are a no-op in case the ++ tensor contents already materialize in the iter_arg buffer. 
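++
++    For illustration, a sketch (shapes borrowed from the test added below):
++    a loop that yields a freshly inserted-into tensor,
++
++      %r = scf.for %i = %lb to %ub step %step iter_args(%tA = %A) -> (tensor<10xf32>) {
++        %0 = tensor.insert %f into %tA[%i] : tensor<10xf32>
++        scf.yield %0 : tensor<10xf32>
++      }
++
++    is rewritten so that the yielded value is first materialized in the
++    iter_arg's future buffer:
++
++      %m = bufferization.materialize_in_destination %0 in %tA
++          : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>
++      scf.yield %m : tensor<10xf32>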
++ }]; ++ ++ let constructor = "mlir::createSCFLoopBufferizationPreprocessingPass()"; ++ let dependentDialects = ["bufferization::BufferizationDialect", ++ "scf::SCFDialect"]; ++} ++ + // Note: Making these canonicalization patterns would require a dependency + // of the SCF dialect on the Affine/Tensor/MemRef dialects or vice versa. + def SCFForLoopCanonicalization +diff --git a/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp b/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp +index 21c618ab633f60..727c4fc7c6396e 100644 +--- a/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp ++++ b/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp +@@ -17,6 +17,7 @@ + + namespace mlir { + #define GEN_PASS_DEF_SCFBUFFERIZE ++#define GEN_PASS_DEF_SCFLOOPBUFFERIZATIONPREPROCESSING + #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" + } // namespace mlir + +@@ -40,8 +41,40 @@ struct SCFBufferizePass : public impl::SCFBufferizeBase { + return signalPassFailure(); + }; + }; ++ ++struct SCFLoopBufferizationPreprocessingPass ++ : public impl::SCFLoopBufferizationPreprocessingBase< ++ SCFLoopBufferizationPreprocessingPass> { ++ void runOnOperation() override { ++ OpBuilder builder(getOperation()->getContext()); ++ getOperation()->walk([&](scf::YieldOp yieldOp) { ++ builder.setInsertionPoint(yieldOp); ++ // TODO: Support scf.while. ++ auto forOp = dyn_cast(yieldOp->getParentOp()); ++ if (!forOp) ++ return WalkResult::skip(); ++ for (OpOperand &operand : yieldOp->getOpOperands()) { ++ auto tensorType = dyn_cast(operand.get().getType()); ++ if (!tensorType) ++ continue; ++ auto bbArg = forOp.getRegionIterArgs()[operand.getOperandNumber()]; ++ Value materialized = ++ builder ++ .create( ++ yieldOp.getLoc(), tensorType, operand.get(), bbArg) ++ .getResult(); ++ operand.set(materialized); ++ } ++ return WalkResult::advance(); ++ }); ++ } ++}; + } // namespace + + std::unique_ptr mlir::createSCFBufferizePass() { + return std::make_unique(); + } ++ ++std::unique_ptr mlir::createSCFLoopBufferizationPreprocessingPass() { ++ return std::make_unique(); ++} +diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir +new file mode 100644 +index 00000000000000..17661178245088 +--- /dev/null ++++ b/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir +@@ -0,0 +1,23 @@ ++// RUN: mlir-opt %s -scf-loop-bufferization-preprocessing -one-shot-bufferize="bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" -canonicalize | FileCheck %s ++ ++// CHECK-LABEL: func @conflict_in_loop( ++// CHECK-SAME: %[[A:.*]]: memref<10xf32> ++func.func @conflict_in_loop(%A: tensor<10xf32>, %f: f32, %idx: index, %lb: index, %ub: index, %step: index) -> f32 { ++ // CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { ++ %r = scf.for %i = %lb to %ub step %step iter_args(%tA = %A) -> (tensor<10xf32>) { ++ // CHECK: %[[alloc:.*]] = memref.alloc() ++ // CHECK: memref.copy %[[A]], %[[alloc]] ++ // CHECK: memref.store %{{.*}}, %[[alloc]] ++ %0 = tensor.insert %f into %tA[%i] : tensor<10xf32> ++ // CHECK: %[[read:.*]] = memref.load %[[A]] ++ %read = tensor.extract %tA[%idx] : tensor<10xf32> ++ // CHECK: vector.print %[[read]] ++ vector.print %read : f32 ++ // CHECK: memref.copy %[[alloc]], %[[A]] ++ scf.yield %0 : tensor<10xf32> ++ } ++ ++ // CHECK: memref.load %[[A]] ++ %f0 = tensor.extract %r[%step] : tensor<10xf32> ++ return %f0 : f32 ++} From 31b1dc91fb6b688e9250b4cc2052f6856258b11e Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 9 Sep 2024 
16:37:39 -0400 Subject: [PATCH 072/183] Add scf-loop-bufferization-preprocessing into bufferization pass --- frontend/catalyst/compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 630bc5c698..ed59e4646c 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -195,6 +195,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt [ "eliminate-empty-tensors", "convert-elementwise-to-linalg", + "scf-loop-bufferization-preprocessing", "one-shot-bufferize{bufferize-function-boundaries}", "convert-arraylist-to-memref", "convert-bufferization-to-memref", From 81f0a868a250173a75965c6551ede759bb45da0d Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 9 Sep 2024 16:38:11 -0400 Subject: [PATCH 073/183] Make llvm patches apply to directory --- mlir/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mlir/Makefile b/mlir/Makefile index c553d3274b..4c8b9ef4f3 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -12,8 +12,9 @@ ENZYME_BUILD_DIR?=$(MK_DIR)/Enzyme/build RT_BUILD_DIR?=$(MK_DIR)/../runtime/build ENABLE_ASAN?=OFF BUILD_TYPE?=Release -LLVM_TARGET_FILE=$(MK_DIR)/llvm-project/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp -LLVM_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch +LLVM_ROOT=$(MK_DIR)/llvm-project +LLVM_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch +LLVM_SCF_PATCH_FILE=$(MK_DIR)/patches/scf-for-preprocessing.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -56,8 +57,11 @@ all: llvm mhlo enzyme dialects .PHONY: llvm llvm: @echo "build LLVM and MLIR enabling Python bindings" - @if patch --dry-run -p1 -N $(LLVM_TARGET_FILE) $(LLVM_PATCH_FILE) > /dev/null 2>&1; then \ - patch -p1 $(LLVM_TARGET_FILE) $(LLVM_PATCH_FILE); \ + @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); \ + fi + @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_SCF_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_SCF_PATCH_FILE); \ fi cmake -G Ninja -S llvm-project/llvm -B $(LLVM_BUILD_DIR) \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ From ee9b78da9519b1efa2912e9b66595532694a384b Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 10 Sep 2024 15:13:26 -0400 Subject: [PATCH 074/183] Remove functionOpInterface workaround for no ReturnLike from the patch --- mlir/patches/moduleOp-bufferization.patch | 622 ++++++++++++++++++++-- 1 file changed, 580 insertions(+), 42 deletions(-) diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch index 51cc79536e..25384f99fb 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/moduleOp-bufferization.patch @@ -47,7 +47,7 @@ index 053ea7935260..fed1c49d22be 100644 auto createdEquiv = equivalentFuncArgs.try_emplace(funcOp, IndexMapping()); auto createdAliasingResults = diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp -index 0a4072605c26..bd054ac4e7b8 100644 +index 0a4072605c26..ce90d907b4ca 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ 
-75,7 +75,7 @@ using namespace mlir::bufferization; @@ -59,7 +59,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 /// Get or create FuncAnalysisState. static FuncAnalysisState & -@@ -88,15 +88,19 @@ getOrCreateFuncAnalysisState(OneShotAnalysisState &state) { +@@ -88,10 +88,11 @@ getOrCreateFuncAnalysisState(OneShotAnalysisState &state) { /// Return the unique ReturnOp that terminates `funcOp`. /// Return nullptr if there is no such unique ReturnOp. @@ -67,7 +67,6 @@ index 0a4072605c26..bd054ac4e7b8 100644 - func::ReturnOp returnOp; - for (Block &b : funcOp.getBody()) { - if (auto candidateOp = dyn_cast(b.getTerminator())) { -+/// Return `funcOp` it self if there is no ReturnOp. +static Operation* getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { + Operation *returnOp = nullptr; + for (Block &b : funcOp.getFunctionBody()) { @@ -76,14 +75,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 if (returnOp) return nullptr; returnOp = candidateOp; - } - } -+ if (!returnOp) -+ return funcOp; - return returnOp; - } - -@@ -126,16 +130,15 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal, +@@ -126,16 +127,15 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal, /// Store function BlockArguments that are equivalent to/aliasing a returned /// value in FuncAnalysisState. static LogicalResult @@ -104,19 +96,16 @@ index 0a4072605c26..bd054ac4e7b8 100644 if (!isa(resultIt.value())) continue; int64_t returnIdx = resultIt.index(); -@@ -147,7 +150,10 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -147,7 +147,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, } // Support only single return-terminated block in the function. - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); -+ // If funcOp has no returnOp, skip the following analysis. + Operation *returnOp = getAssumedUniqueReturnOp(funcOp); -+ if (returnOp == funcOp) -+ return success(); assert(returnOp && "expected func with single return op"); for (OpOperand &returnVal : returnOp->getOpOperands()) -@@ -168,7 +174,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -168,7 +168,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, return success(); } @@ -125,7 +114,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 bool isWritten) { OpBuilder b(funcOp.getContext()); Attribute accessType; -@@ -189,12 +195,12 @@ static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, +@@ -189,12 +189,12 @@ static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, /// function with unknown ops, we conservatively assume that such ops bufferize /// to a read + write. static LogicalResult @@ -141,7 +130,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 continue; bool isRead; bool isWritten; -@@ -204,7 +210,7 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -204,7 +204,7 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, StringRef str = accessAttr.getValue(); isRead = str == "read" || str == "read-write"; isWritten = str == "write" || str == "read-write"; @@ -150,7 +139,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 // If the function has no body, conservatively assume that all args are // read + written. 
isRead = true; -@@ -230,20 +236,19 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -230,20 +230,19 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, /// Remove bufferization attributes on FuncOp arguments. static void removeBufferizationAttributes(BlockArgument bbArg) { @@ -174,7 +163,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 SymbolTable::lookupNearestSymbolFrom(callOp, sym)); } -@@ -251,12 +256,12 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { +@@ -251,12 +250,12 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { /// Note: This only adds new equivalence info if the called function was already /// analyzed. // TODO: This does not handle cyclic function call graphs etc. @@ -191,7 +180,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 // No equivalence info available for the called function. if (!funcState.equivalentFuncArgs.count(calledFunction)) -@@ -267,7 +272,7 @@ static void equivalenceAnalysis(func::FuncOp funcOp, +@@ -267,7 +266,7 @@ static void equivalenceAnalysis(func::FuncOp funcOp, int64_t bbargIdx = it.second; if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) continue; @@ -200,7 +189,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 Value argVal = callOp->getOperand(bbargIdx); state.unionEquivalenceClasses(returnVal, argVal); } -@@ -277,10 +282,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, +@@ -277,10 +276,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, } /// Return "true" if the given function signature has tensor semantics. @@ -214,7 +203,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 llvm::IsaPred); } -@@ -291,17 +296,17 @@ static bool hasTensorSignature(func::FuncOp funcOp) { +@@ -291,16 +290,16 @@ static bool hasTensorSignature(func::FuncOp funcOp) { /// retrieve the called FuncOp from any func::CallOp. static LogicalResult getFuncOpsOrderedByCalls(ModuleOp moduleOp, @@ -230,16 +219,14 @@ index 0a4072605c26..bd054ac4e7b8 100644 - WalkResult res = moduleOp.walk([&](func::FuncOp funcOp) -> WalkResult { - if (!funcOp.getBody().empty()) { - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); -- if (!returnOp) + DenseMap numberCallOpsContainedInFuncOp; + WalkResult res = moduleOp.walk([&](FunctionOpInterface funcOp) -> WalkResult { + if (!funcOp.getFunctionBody().empty()) { + Operation *returnOp = getAssumedUniqueReturnOp(funcOp); -+ if (!returnOp && returnOp != funcOp) + if (!returnOp) return funcOp->emitError() << "cannot bufferize a FuncOp with tensors and " - "without a unique ReturnOp"; -@@ -309,9 +314,9 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, +@@ -309,9 +308,9 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, // Collect function calls and populate the caller map. numberCallOpsContainedInFuncOp[funcOp] = 0; @@ -252,7 +239,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 // If the called function does not have any tensors in its signature, then // it is not necessary to bufferize the callee before the caller. if (!hasTensorSignature(calledFunction)) -@@ -349,11 +354,15 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, +@@ -349,11 +348,11 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, /// most generic layout map as function return types. After bufferizing the /// entire function body, a more concise memref type can potentially be used for /// the return type of the function. 
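/// For example (an illustrative sketch): if the unique return yields
/// %c = memref.cast %m : memref<5xf32> to memref<5xf32, strided<[?], offset: ?>>,
/// the cast can be folded into the return and the function's result type
/// tightened to memref<5xf32>.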
@@ -260,18 +247,14 @@ index 0a4072605c26..bd054ac4e7b8 100644 - if (funcOp.getBody().empty()) +static void foldMemRefCasts(FunctionOpInterface funcOp) { + if (funcOp.getFunctionBody().empty()) -+ return; -+ -+ Operation *returnOp = getAssumedUniqueReturnOp(funcOp); -+ -+ if (!returnOp || returnOp == funcOp) return; - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); ++ Operation *returnOp = getAssumedUniqueReturnOp(funcOp); SmallVector resultTypes; for (OpOperand &operand : returnOp->getOpOperands()) { -@@ -366,7 +375,7 @@ static void foldMemRefCasts(func::FuncOp funcOp) { +@@ -366,7 +365,7 @@ static void foldMemRefCasts(func::FuncOp funcOp) { } auto newFuncType = FunctionType::get( @@ -280,7 +263,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 funcOp.setType(newFuncType); } -@@ -379,7 +388,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -379,7 +378,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, FuncAnalysisState &funcState = getOrCreateFuncAnalysisState(state); // A list of functions in the order in which they are analyzed + bufferized. @@ -289,17 +272,16 @@ index 0a4072605c26..bd054ac4e7b8 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -388,7 +397,8 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -388,7 +387,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, return failure(); // Analyze ops. - for (func::FuncOp funcOp : orderedFuncOps) { + for (FunctionOpInterface funcOp : orderedFuncOps) { -+ if (!state.getOptions().isOpAllowed(funcOp)) continue; -@@ -416,7 +426,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -416,7 +415,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, void mlir::bufferization::removeBufferizationAttributesInModule( ModuleOp moduleOp) { @@ -308,7 +290,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 for (BlockArgument bbArg : op.getArguments()) removeBufferizationAttributes(bbArg); }); -@@ -430,7 +440,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -430,7 +429,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( IRRewriter rewriter(moduleOp.getContext()); // A list of functions in the order in which they are analyzed + bufferized. @@ -317,7 +299,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -439,11 +449,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -439,11 +438,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( return failure(); // Bufferize functions. @@ -331,7 +313,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 // This function was not analyzed and RaW conflicts were not resolved. // Buffer copies must be inserted before every write. OneShotBufferizationOptions updatedOptions = options; -@@ -463,7 +473,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -463,7 +462,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( // Bufferize all other ops. for (Operation &op : llvm::make_early_inc_range(moduleOp.getOps())) { // Functions were already bufferized. @@ -340,7 +322,7 @@ index 0a4072605c26..bd054ac4e7b8 100644 continue; if (failed(bufferizeOp(&op, options, statistics))) return failure(); -@@ -490,12 +500,12 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize( +@@ -490,12 +489,12 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize( // FuncOps whose names are specified in options.noAnalysisFuncFilter will // not be analyzed. Ops in these FuncOps will not be analyzed as well. 
OpFilter::Entry::FilterFn analysisFilterFn = [=](Operation *op) { @@ -356,3 +338,559 @@ index 0a4072605c26..bd054ac4e7b8 100644 return false; }; OneShotBufferizationOptions updatedOptions(options); +diff --git a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir +index 3c50a9e72d9d..588aa8a85a84 100644 +--- a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir ++++ b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir +@@ -1,4 +1,4 @@ +-// RUN: mlir-opt --transform-interpreter %s -split-input-file -verify-diagnostics | FileCheck %s ++// RUN: mlir-opt --transform-interpreter="debug-payload-root-tag=payload" %s -split-input-file -verify-diagnostics | FileCheck %s + + // Test One-Shot Bufferize. + +@@ -12,19 +12,21 @@ module attributes {transform.with_named_sequence} { + + // CHECK-LABEL: func @test_function( + // CHECK-SAME: %[[A:.*]]: tensor +-func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { +- %c0 = arith.constant 0 : index ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { ++ %c0 = arith.constant 0 : index + +- // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] +- // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] +- // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) +- // CHECK: memref.copy %[[A_memref]], %[[alloc]] +- // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] +- // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]] +- %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor ++ // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] ++ // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] ++ // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) ++ // CHECK: memref.copy %[[A_memref]], %[[alloc]] ++ // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] ++ // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]] ++ %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor + +- // CHECK: return %[[res_tensor]] +- return %0 : tensor ++ // CHECK: return %[[res_tensor]] ++ return %0 : tensor ++ } + } + + // ----- +@@ -42,19 +44,21 @@ module attributes {transform.with_named_sequence} { + // CHECK-LABEL: func @test_function( + // CHECK-SAME: %[[A:.*]]: tensor + // CHECK-NOT: memref.copy +-func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { +- %c0 = arith.constant 0 : index ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { ++ %c0 = arith.constant 0 : index + +- // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] +- // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] +- // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) +- // CHECK: linalg.copy ins(%[[A_memref]] : memref<{{.*}}>) outs(%[[alloc]] +- // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] +- // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]] +- %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor ++ // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] ++ // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] ++ // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) ++ // CHECK: linalg.copy ins(%[[A_memref]] : memref<{{.*}}>) outs(%[[alloc]] ++ // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] ++ // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]] ++ %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor + +- // CHECK: return 
%[[res_tensor]] +- return %0 : tensor ++ // CHECK: return %[[res_tensor]] ++ return %0 : tensor ++ } + } + + // ----- +@@ -72,13 +76,15 @@ module attributes {transform.with_named_sequence} { + + // CHECK-LABEL: func @test_function_analysis( + // CHECK-SAME: %[[A:.*]]: tensor +-func.func @test_function_analysis(%A : tensor, %v : vector<4xf32>) -> (tensor) { +- %c0 = arith.constant 0 : index +- // CHECK: vector.transfer_write +- // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]} +- // CHECK-SAME: tensor +- %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor +- return %0 : tensor ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @test_function_analysis(%A : tensor, %v : vector<4xf32>) -> (tensor) { ++ %c0 = arith.constant 0 : index ++ // CHECK: vector.transfer_write ++ // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]} ++ // CHECK-SAME: tensor ++ %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor ++ return %0 : tensor ++ } + } + + // ----- +@@ -95,10 +101,12 @@ module attributes {transform.with_named_sequence} { + } + } + +-func.func @test_unknown_op_failure() -> (tensor) { +- // expected-error @+1 {{op was not bufferized}} +- %0 = "test.dummy_op"() : () -> (tensor) +- return %0 : tensor ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @test_unknown_op_failure() -> (tensor) { ++ // expected-error @+1 {{op was not bufferized}} ++ %0 = "test.dummy_op"() : () -> (tensor) ++ return %0 : tensor ++ } + } + + // ----- +@@ -111,7 +119,7 @@ module attributes {transform.with_named_sequence} { + } + } + +-module { ++module @payload attributes { transform.target_tag = "payload" } { + // CHECK-LABEL: func @test_function( + // CHECK-SAME: %[[A:.*]]: tensor + func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { +@@ -146,11 +154,13 @@ module attributes {transform.with_named_sequence} { + // CHECK-SAME: %[[A:.*]]: memref<12x9xf32>, + // CHECK-SAME: %[[B:.*]]: memref<9x6xf32>, + // CHECK-SAME: %[[C:.*]]: memref<12x6xf32>) -> memref<12x6xf32> { +-func.func @matmul(%A: tensor<12x9xf32>, %B: tensor<9x6xf32>, %C: tensor<12x6xf32>) -> tensor<12x6xf32> { +- // CHECK: linalg.matmul ins(%[[A]], %[[B]] : memref<12x9xf32>, memref<9x6xf32>) outs(%[[C]] : memref<12x6xf32>) +- %D = linalg.matmul ins(%A, %B: tensor<12x9xf32>, tensor<9x6xf32>) outs(%C: tensor<12x6xf32>) -> tensor<12x6xf32> +- // CHECK: return %[[C]] : memref<12x6xf32> +- return %D : tensor<12x6xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @matmul(%A: tensor<12x9xf32>, %B: tensor<9x6xf32>, %C: tensor<12x6xf32>) -> tensor<12x6xf32> { ++ // CHECK: linalg.matmul ins(%[[A]], %[[B]] : memref<12x9xf32>, memref<9x6xf32>) outs(%[[C]] : memref<12x6xf32>) ++ %D = linalg.matmul ins(%A, %B: tensor<12x9xf32>, tensor<9x6xf32>) outs(%C: tensor<12x6xf32>) -> tensor<12x6xf32> ++ // CHECK: return %[[C]] : memref<12x6xf32> ++ return %D : tensor<12x6xf32> ++ } + } + + // ----- +@@ -165,10 +175,12 @@ module attributes {transform.with_named_sequence} { + } + + // Expect `bufferization.empty_tensor_to_alloc_tensor` to replace the tensor.empty. 
+-func.func @empty_to_tensor_alloc() -> tensor<2x2xf32> { +- // CHECK: bufferization.alloc_tensor +- %0 = tensor.empty() : tensor<2x2xf32> +- return %0 : tensor<2x2xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @empty_to_tensor_alloc() -> tensor<2x2xf32> { ++ // CHECK: bufferization.alloc_tensor ++ %0 = tensor.empty() : tensor<2x2xf32> ++ return %0 : tensor<2x2xf32> ++ } + } + + // ----- +@@ -185,13 +197,15 @@ module attributes {transform.with_named_sequence} { + // CHECK: tensor.extract_slice + // CHECK: linalg.fill + // CHECK: tensor.insert_slice +-func.func @empty_tensor_elimination( +- %t: tensor<10xf32>, %f: f32) -> tensor<10xf32> { +- %0 = tensor.empty() : tensor<5xf32> +- %1 = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32> +- %2 = tensor.insert_slice %1 into %t [1][5][1] +- : tensor<5xf32> into tensor<10xf32> +- return %2 : tensor<10xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @empty_tensor_elimination( ++ %t: tensor<10xf32>, %f: f32) -> tensor<10xf32> { ++ %0 = tensor.empty() : tensor<5xf32> ++ %1 = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32> ++ %2 = tensor.insert_slice %1 into %t [1][5][1] ++ : tensor<5xf32> into tensor<10xf32> ++ return %2 : tensor<10xf32> ++ } + } + + // ----- +@@ -208,12 +222,14 @@ module attributes {transform.with_named_sequence} { + // CHECK: memref.alloca + // CHECK: scf.for + // CHECK: memref.store +-func.func @buffer_loop_hoisting(%lb: index, %ub: index, %step: index, %f: f32, %pos: index) { +- scf.for %iv = %lb to %ub step %step { +- %0 = memref.alloca() : memref<5xf32> +- memref.store %f, %0[%pos] : memref<5xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @buffer_loop_hoisting(%lb: index, %ub: index, %step: index, %f: f32, %pos: index) { ++ scf.for %iv = %lb to %ub step %step { ++ %0 = memref.alloca() : memref<5xf32> ++ memref.store %f, %0[%pos] : memref<5xf32> ++ } ++ return + } +- return + } + + // ----- +@@ -231,10 +247,12 @@ module attributes {transform.with_named_sequence} { + + // Expect `bufferization.bufferize_to_allocation` to create an alloc. 
+ // CHECK-LABEL: func.func @empty_to_tensor_alloc() +-func.func @empty_to_tensor_alloc() -> tensor<2x2xf32> { +- // CHECK-NEXT: %[[alloca:.*]] = memref.alloca() : memref<2x2xf32> +- // CHECK-NEXT: %[[tensor:.*]] = bufferization.to_tensor %[[alloca]] restrict writable : memref<2x2xf32> +- // CHECK-NEXT: return %[[tensor]] : tensor<2x2xf32> +- %0 = bufferization.alloc_tensor() : tensor<2x2xf32> +- return %0 : tensor<2x2xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @empty_to_tensor_alloc() -> tensor<2x2xf32> { ++ // CHECK-NEXT: %[[alloca:.*]] = memref.alloca() : memref<2x2xf32> ++ // CHECK-NEXT: %[[tensor:.*]] = bufferization.to_tensor %[[alloca]] restrict writable : memref<2x2xf32> ++ // CHECK-NEXT: return %[[tensor]] : tensor<2x2xf32> ++ %0 = bufferization.alloc_tensor() : tensor<2x2xf32> ++ return %0 : tensor<2x2xf32> ++ } + } +diff --git a/mlir/test/Dialect/LLVM/transform-e2e.mlir b/mlir/test/Dialect/LLVM/transform-e2e.mlir +index c00b47fb936e..3e637a3ec49a 100644 +--- a/mlir/test/Dialect/LLVM/transform-e2e.mlir ++++ b/mlir/test/Dialect/LLVM/transform-e2e.mlir +@@ -1,15 +1,17 @@ +-// RUN: mlir-opt %s --transform-interpreter -test-transform-dialect-erase-schedule --test-lower-to-llvm --split-input-file | FileCheck %s ++// RUN: mlir-opt %s --transform-interpreter="debug-payload-root-tag=payload" -test-transform-dialect-erase-schedule --test-lower-to-llvm --split-input-file | FileCheck %s + + // CHECK-LABEL: llvm.func @matmul_tensors +-func.func @matmul_tensors( +- %arg0: tensor<2x4xf32>, %arg1: tensor<4x6xf32>, %arg2: tensor<2x6xf32>) +- -> tensor<2x6xf32> { +-// CHECK-NOT: linalg +-// CHECK: llvm.intr.fmuladd{{.*}} +- %0 = linalg.matmul ins(%arg0, %arg1: tensor<2x4xf32>, tensor<4x6xf32>) +- outs(%arg2: tensor<2x6xf32>) +- -> tensor<2x6xf32> +- return %0 : tensor<2x6xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @matmul_tensors( ++ %arg0: tensor<2x4xf32>, %arg1: tensor<4x6xf32>, %arg2: tensor<2x6xf32>) ++ -> tensor<2x6xf32> { ++ // CHECK-NOT: linalg ++ // CHECK: llvm.intr.fmuladd{{.*}} ++ %0 = linalg.matmul ins(%arg0, %arg1: tensor<2x4xf32>, tensor<4x6xf32>) ++ outs(%arg2: tensor<2x6xf32>) ++ -> tensor<2x6xf32> ++ return %0 : tensor<2x6xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir +index 3f8d2ea06641..9c223737750a 100644 +--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir ++++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir +@@ -1,4 +1,4 @@ +-// RUN: mlir-opt --split-input-file --transform-interpreter %s | FileCheck %s ++// RUN: mlir-opt --split-input-file --transform-interpreter="debug-payload-root-tag=payload" %s | FileCheck %s + + // CHECK-LABEL: func @matmul_divisible + // CHECK: scf.forall +@@ -24,19 +24,21 @@ + // CHECK: scf.forall + // CHECK: vector.transfer_read + // CHECK: vector.transfer_write +-func.func @matmul_divisible(%A: tensor<1024x1024xf32>, +- %B: tensor<1024x1024xf32>, +- %C: tensor<1024x1024xf32>) +- -> tensor<1024x1024xf32> +-{ +- %cst = arith.constant 0.000000e+00 : f32 +- %0 = linalg.fill ins(%cst : f32) +- outs(%C : tensor<1024x1024xf32>) ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @matmul_divisible(%A: tensor<1024x1024xf32>, ++ %B: tensor<1024x1024xf32>, ++ %C: tensor<1024x1024xf32>) + -> tensor<1024x1024xf32> +- %1 = linalg.matmul ins(%A, %B : 
tensor<1024x1024xf32>, tensor<1024x1024xf32>) +- outs(%0 : tensor<1024x1024xf32>) +- -> tensor<1024x1024xf32> +- return %1 : tensor<1024x1024xf32> ++ { ++ %cst = arith.constant 0.000000e+00 : f32 ++ %0 = linalg.fill ins(%cst : f32) ++ outs(%C : tensor<1024x1024xf32>) ++ -> tensor<1024x1024xf32> ++ %1 = linalg.matmul ins(%A, %B : tensor<1024x1024xf32>, tensor<1024x1024xf32>) ++ outs(%0 : tensor<1024x1024xf32>) ++ -> tensor<1024x1024xf32> ++ return %1 : tensor<1024x1024xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +@@ -143,19 +145,21 @@ module attributes {transform.with_named_sequence} { + // CHECK: linalg.matmul + // CHECK: vector.transfer_read + // CHECK: vector.transfer_write ++module @payload attributes { transform.target_tag = "payload" } { + func.func @matmul_not_divisible(%A: tensor<1023x1023xf32>, +- %B: tensor<1023x1023xf32>, +- %C: tensor<1023x1023xf32>) +- -> tensor<1023x1023xf32> +-{ +- %cst = arith.constant 0.000000e+00 : f32 +- %0 = linalg.fill ins(%cst : f32) +- outs(%C : tensor<1023x1023xf32>) ++ %B: tensor<1023x1023xf32>, ++ %C: tensor<1023x1023xf32>) + -> tensor<1023x1023xf32> +- %1 = linalg.matmul ins(%A, %B : tensor<1023x1023xf32>, tensor<1023x1023xf32>) +- outs(%0 : tensor<1023x1023xf32>) +- -> tensor<1023x1023xf32> +- return %1 : tensor<1023x1023xf32> ++ { ++ %cst = arith.constant 0.000000e+00 : f32 ++ %0 = linalg.fill ins(%cst : f32) ++ outs(%C : tensor<1023x1023xf32>) ++ -> tensor<1023x1023xf32> ++ %1 = linalg.matmul ins(%A, %B : tensor<1023x1023xf32>, tensor<1023x1023xf32>) ++ outs(%0 : tensor<1023x1023xf32>) ++ -> tensor<1023x1023xf32> ++ return %1 : tensor<1023x1023xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +diff --git a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir +index f2e9e839b7c4..5e5657980ba1 100644 +--- a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir ++++ b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir +@@ -1,5 +1,5 @@ + +-// RUN: mlir-opt --transform-interpreter -cse -canonicalize -split-input-file -verify-diagnostics %s | FileCheck %s ++// RUN: mlir-opt --transform-interpreter="debug-payload-root-tag=payload" -cse -canonicalize -split-input-file -verify-diagnostics %s | FileCheck %s + + #map = affine_map<()[s0] -> (-s0 + 12, 7)> + +@@ -7,43 +7,45 @@ + // CHECK-SAME: %[[arg0:.*]]: memref<24x12xf32, strided<[?, ?], offset: ?>>, + // CHECK-SAME: %[[arg1:.*]]: memref<12x25xf32, strided<[?, ?], offset: ?>>, + // CHECK-SAME: %[[arg2:.*]]: memref<24x25xf32, strided<[?, ?], offset: ?>>, +-func.func @pad_to_memory_space(%arg0: tensor<24x12xf32>, +- %arg1: tensor<12x25xf32>, +- %arg2: tensor<24x25xf32>, +- %iv0 : index, %iv1 : index, +- %iv2 : index) -> tensor<24x25xf32> { +- %0 = affine.min #map()[%iv2] +- +- // CHECK: %[[s0:.*]] = memref.subview %[[arg0]] +- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> +- // CHECK: %[[s1:.*]] = memref.subview %[[arg1]] +- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor +- // CHECK: %[[s2:.*]] = memref.subview %[[arg2]] +- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> +- +- // CHECK: %[[alloc0:.*]] = memref.alloc() : memref<4x7xf32, 3> +- // CHECK: linalg.fill {{.*}} outs(%[[alloc0]] +- // CHECK: %[[alloc0_view:.*]] = memref.subview %[[alloc0]][0, 0] [4, %{{.*}}] [1, 1] +- // CHECK: memref.copy %[[s0]], %[[alloc0_view]] +- +- // CHECK: 
%[[alloc1:.*]] = memref.alloc() : memref<7x5xf32, 3> +- // CHECK: linalg.fill {{.*}} outs(%[[alloc1]] +- // CHECK: %[[alloc1_view:.*]] = memref.subview %[[alloc1]][0, 0] [%{{.*}}, 5] [1, 1] +- // CHECK: memref.copy %[[s1]], %[[alloc1_view]] +- +- // CHECK: %[[alloc2:.*]] = memref.alloc() : memref<4x5xf32, 3> +- // CHECK-NOT: linalg.fill {{.*}} outs(%[[alloc2]] +- // No subview because there is 0 padding +- // CHECK: memref.copy %[[s2]], %[[alloc2]] +- +- // CHECK: linalg.matmul ins(%[[alloc0]], %[[alloc1]] : {{.*}}) outs(%[[alloc2]] : {{.*}}) +- // Copy back result. +- // CHECK: memref.copy %[[alloc2]], %[[s2]] +- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> +- +- // insert_slice bufferizes to a no-op. +- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> +- func.return %5 : tensor<24x25xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @pad_to_memory_space(%arg0: tensor<24x12xf32>, ++ %arg1: tensor<12x25xf32>, ++ %arg2: tensor<24x25xf32>, ++ %iv0 : index, %iv1 : index, ++ %iv2 : index) -> tensor<24x25xf32> { ++ %0 = affine.min #map()[%iv2] ++ ++ // CHECK: %[[s0:.*]] = memref.subview %[[arg0]] ++ %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> ++ // CHECK: %[[s1:.*]] = memref.subview %[[arg1]] ++ %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor ++ // CHECK: %[[s2:.*]] = memref.subview %[[arg2]] ++ %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> ++ ++ // CHECK: %[[alloc0:.*]] = memref.alloc() : memref<4x7xf32, 3> ++ // CHECK: linalg.fill {{.*}} outs(%[[alloc0]] ++ // CHECK: %[[alloc0_view:.*]] = memref.subview %[[alloc0]][0, 0] [4, %{{.*}}] [1, 1] ++ // CHECK: memref.copy %[[s0]], %[[alloc0_view]] ++ ++ // CHECK: %[[alloc1:.*]] = memref.alloc() : memref<7x5xf32, 3> ++ // CHECK: linalg.fill {{.*}} outs(%[[alloc1]] ++ // CHECK: %[[alloc1_view:.*]] = memref.subview %[[alloc1]][0, 0] [%{{.*}}, 5] [1, 1] ++ // CHECK: memref.copy %[[s1]], %[[alloc1_view]] ++ ++ // CHECK: %[[alloc2:.*]] = memref.alloc() : memref<4x5xf32, 3> ++ // CHECK-NOT: linalg.fill {{.*}} outs(%[[alloc2]] ++ // No subview because there is 0 padding ++ // CHECK: memref.copy %[[s2]], %[[alloc2]] ++ ++ // CHECK: linalg.matmul ins(%[[alloc0]], %[[alloc1]] : {{.*}}) outs(%[[alloc2]] : {{.*}}) ++ // Copy back result. ++ // CHECK: memref.copy %[[alloc2]], %[[s2]] ++ %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> ++ ++ // insert_slice bufferizes to a no-op. 
++ %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> ++ func.return %5 : tensor<24x25xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +@@ -69,40 +71,42 @@ module attributes {transform.with_named_sequence} { + // CHECK-SAME: %[[arg0:.*]]: memref<24x12xf32, strided<[?, ?], offset: ?>>, + // CHECK-SAME: %[[arg1:.*]]: memref<12x25xf32, strided<[?, ?], offset: ?>>, + // CHECK-SAME: %[[arg2:.*]]: memref<24x25xf32, strided<[?, ?], offset: ?>>, +-func.func @vectorize_and_bufferize_pad(%arg0: tensor<24x12xf32>, +- %arg1: tensor<12x25xf32>, +- %arg2: tensor<24x25xf32>, +- %iv0 : index, %iv1 : index, +- %iv2 : index) -> tensor<24x25xf32> { +- %0 = affine.min #map()[%iv2] +- +- // CHECK: %[[s0:.*]] = memref.subview %[[arg0]] +- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> +- // CHECK: %[[s1:.*]] = memref.subview %[[arg1]] +- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor +- // CHECK: %[[s2:.*]] = memref.subview %[[arg2]] +- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> +- +- // CHECK: %[[v0:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s0]] +- // CHECK: %[[alloc0:.*]] = memref.alloc() : memref<4x7xf32, 3> +- // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v0]], %[[alloc0]] +- +- // CHECK: %[[v1:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s1]] +- // CHECK: %[[alloc1:.*]] = memref.alloc() : memref<7x5xf32, 3> +- // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v1]], %[[alloc1]] +- +- // CHECK: %[[v2:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s2]] +- // CHECK: %[[alloc2:.*]] = memref.alloc() : memref<4x5xf32, 3> +- // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v2]], %[[alloc0]] +- +- // CHECK: linalg.matmul ins(%[[alloc0]], %[[alloc1]] : {{.*}}) outs(%[[alloc2]] : {{.*}}) +- // Copy back result. +- // CHECK: memref.copy %[[alloc2]], %[[s2]] +- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> +- +- // insert_slice bufferizes to a no-op. 
+- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> +- func.return %5 : tensor<24x25xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @vectorize_and_bufferize_pad(%arg0: tensor<24x12xf32>, ++ %arg1: tensor<12x25xf32>, ++ %arg2: tensor<24x25xf32>, ++ %iv0 : index, %iv1 : index, ++ %iv2 : index) -> tensor<24x25xf32> { ++ %0 = affine.min #map()[%iv2] ++ ++ // CHECK: %[[s0:.*]] = memref.subview %[[arg0]] ++ %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> ++ // CHECK: %[[s1:.*]] = memref.subview %[[arg1]] ++ %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor ++ // CHECK: %[[s2:.*]] = memref.subview %[[arg2]] ++ %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> ++ ++ // CHECK: %[[v0:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s0]] ++ // CHECK: %[[alloc0:.*]] = memref.alloc() : memref<4x7xf32, 3> ++ // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v0]], %[[alloc0]] ++ ++ // CHECK: %[[v1:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s1]] ++ // CHECK: %[[alloc1:.*]] = memref.alloc() : memref<7x5xf32, 3> ++ // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v1]], %[[alloc1]] ++ ++ // CHECK: %[[v2:.*]] = vector.mask {{.*}} { vector.transfer_read %[[s2]] ++ // CHECK: %[[alloc2:.*]] = memref.alloc() : memref<4x5xf32, 3> ++ // CHECK: vector.mask {{.*}} { vector.transfer_write %[[v2]], %[[alloc0]] ++ ++ // CHECK: linalg.matmul ins(%[[alloc0]], %[[alloc1]] : {{.*}}) outs(%[[alloc2]] : {{.*}}) ++ // Copy back result. ++ // CHECK: memref.copy %[[alloc2]], %[[s2]] ++ %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> ++ ++ // insert_slice bufferizes to a no-op. 
++ %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> ++ func.return %5 : tensor<24x25xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +diff --git a/mlir/test/Dialect/Vector/transform-vector.mlir b/mlir/test/Dialect/Vector/transform-vector.mlir +index 75b29e22b4d2..21e615e60438 100644 +--- a/mlir/test/Dialect/Vector/transform-vector.mlir ++++ b/mlir/test/Dialect/Vector/transform-vector.mlir +@@ -1,16 +1,18 @@ +-// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s ++// RUN: mlir-opt --transform-interpreter="debug-payload-root-tag=payload" %s --split-input-file | FileCheck %s + + // CHECK-LABEL: func @matmul_tensors +-func.func @matmul_tensors( +- %arg0: tensor<8x16xf32>, %arg1: tensor<16x32xf32>, %arg2: tensor<8x32xf32>) +- -> tensor<8x32xf32> { +-// CHECK-NOT: linalg +-// CHECK: vector.extract {{.*}} : vector<4xf32> from vector<8x4xf32> +-// CHECK: vector.store {{.*}} : memref<8x32xf32>, vector<4xf32> +- %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x16xf32>, tensor<16x32xf32>) +- outs(%arg2: tensor<8x32xf32>) +- -> tensor<8x32xf32> +- return %0 : tensor<8x32xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @matmul_tensors( ++ %arg0: tensor<8x16xf32>, %arg1: tensor<16x32xf32>, %arg2: tensor<8x32xf32>) ++ -> tensor<8x32xf32> { ++ // CHECK-NOT: linalg ++ // CHECK: vector.extract {{.*}} : vector<4xf32> from vector<8x4xf32> ++ // CHECK: vector.store {{.*}} : memref<8x32xf32>, vector<4xf32> ++ %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x16xf32>, tensor<16x32xf32>) ++ outs(%arg2: tensor<8x32xf32>) ++ -> tensor<8x32xf32> ++ return %0 : tensor<8x32xf32> ++ } + } + + module attributes {transform.with_named_sequence} { +@@ -76,11 +78,13 @@ module attributes {transform.with_named_sequence} { + // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]] : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> + // CHECK-NEXT: return %[[R]] : vector<64x64xf32> +-func.func @fold_arith_extf_into_contract(%arg0: vector<64x64xf16>, %arg1: vector<64x64xf16>, %arg2: vector<64x64xf32>) -> vector<64x64xf32> { +- %lhs_f32 = arith.extf %arg0 : vector<64x64xf16> to vector<64x64xf32> +- %rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32> +- %result = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32> +- return %result : vector<64x64xf32> ++module @payload attributes { transform.target_tag = "payload" } { ++ func.func @fold_arith_extf_into_contract(%arg0: vector<64x64xf16>, %arg1: vector<64x64xf16>, %arg2: vector<64x64xf32>) -> vector<64x64xf32> { ++ %lhs_f32 = arith.extf %arg0 : vector<64x64xf16> to vector<64x64xf32> ++ %rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32> ++ %result = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32> ++ return %result : vector<64x64xf32> ++ } + } + + module attributes {transform.with_named_sequence} { From 
b9f0e9cd8b0aeec689ee9a0177c1dc3488513f13 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 10 Sep 2024 15:39:21 -0400 Subject: [PATCH 075/183] Make CopyGlobalMemRef create memref without memory layout and then cast it back --- mlir/lib/Quantum/Transforms/cp_global_buffers.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp index b461dc8a60..227d7977b0 100644 --- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp +++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp @@ -87,7 +87,10 @@ llvm::SmallVector getReturnMemRefs(func::ReturnOp op) */ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) { - auto memrefType = cast(memref.getType()); + auto origMemrefType = cast(memref.getType()); + // Rebuild MemRefType without memory layout. + auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); + llvm::SmallVector dynDims; { llvm::SmallVector dynIndices; @@ -102,8 +105,10 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) } Value newMemRef = rewriter.create(loc, memrefType, dynDims); + // Cast memrefType back to maintain memory layout. + Value castMemRef = rewriter.create(loc, origMemrefType, newMemRef); rewriter.create(loc, memref, newMemRef); - return newMemRef; + return castMemRef; } /** From c522097a1872492c7a4cbb61988f3c42b4645742 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 11 Sep 2024 15:56:23 -0400 Subject: [PATCH 076/183] Use allow-return-allocs-from-loops to avoid scf.for/while errors --- frontend/catalyst/compiler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index ed59e4646c..f0408fac55 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -195,8 +195,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt [ "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "scf-loop-bufferization-preprocessing", - "one-shot-bufferize{bufferize-function-boundaries}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}", "convert-arraylist-to-memref", "convert-bufferization-to-memref", "canonicalize", From 553ed3e5f996ddf34cbe556bc89f42d40df22251 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 11 Sep 2024 20:52:07 -0400 Subject: [PATCH 077/183] Make ForwardOp and ReverseOp compatible with ModuleOpBufferize --- .../BufferizableOpInterfaceImpl.cpp | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index e7605ce107..04e5f92003 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -3,6 +3,7 @@ #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Index/IR/IndexOps.h" @@ -234,7 +235,25 @@ struct BackpropOpInterface }; struct ForwardOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { 
+ : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< + ForwardOpInterface, ForwardOp> { + + static bool supportsUnstructuredControlFlow() { return true; } + + bool hasTensorSemantics(Operation *op) const + { + auto isaTensor = llvm::IsaPred; + + // A function has tensor semantics if it has tensor arguments/results. + auto forwardOp = cast(op); + bool hasTensorArg = any_of(forwardOp.getArgumentTypes(), isaTensor); + bool hasTensorResult = any_of(forwardOp.getResultTypes(), isaTensor); + if (hasTensorArg || hasTensorResult) + return true; + + return false; + } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { @@ -328,7 +347,25 @@ struct ForwardOpInterface }; struct ReverseOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< + ReverseOpInterface, ReverseOp> { + + static bool supportsUnstructuredControlFlow() { return true; } + + bool hasTensorSemantics(Operation *op) const + { + auto isaTensor = llvm::IsaPred; + + // A function has tensor semantics if it has tensor arguments/results. + auto reverseOp = cast(op); + bool hasTensorArg = any_of(reverseOp.getArgumentTypes(), isaTensor); + bool hasTensorResult = any_of(reverseOp.getResultTypes(), isaTensor); + if (hasTensorArg || hasTensorResult) + return true; + + return false; + } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { @@ -352,6 +389,9 @@ struct ReverseOpInterface const bufferization::BufferizationOptions &options) const { auto reverseOp = cast(op); + llvm::outs() << "Found reverse!\n"; + llvm::outs() << "Found reverse!\n"; + llvm::outs() << "Found reverse!\n"; auto argc = reverseOp.getArgc(); auto resc = reverseOp.getResc(); From c41879216cf0b9abbd40a2827bd70c3d2d2f531e Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 13:55:05 -0400 Subject: [PATCH 078/183] Make ForwardOp and ReverseOp update signature --- .../BufferizableOpInterfaceImpl.cpp | 50 ++++++++++++++++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 04e5f92003..156db2c599 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -285,8 +285,28 @@ struct ForwardOpInterface SmallVector outputs; SmallVector cotangents; + // Update signature + auto argTys = forwardOp.getArgumentTypes(); + auto retTys = forwardOp.getResultTypes(); + SmallVector emptyRets; + SmallVector args(argTys.begin(), argTys.end()); + args.insert(args.end(), retTys.begin(), retTys.end()); + SmallVector bufferArgs; + for (Type ty : args) { + auto tensorType = dyn_cast(ty); + if (!tensorType) + bufferArgs.push_back(ty); + else + bufferArgs.push_back( + MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + } + auto forwardTy = rewriter.getFunctionType(bufferArgs, emptyRets); + Block *block; - rewriter.modifyOpInPlace(op, [&] { block = forwardOp.addEntryBlock(); }); + rewriter.modifyOpInPlace(op, [&] { + forwardOp.setFunctionType(forwardTy); + block = forwardOp.addEntryBlock(); + }); PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(block); @@ -316,7 +336,6 @@ struct ForwardOpInterface Value tensorIn = rewriter.create(loc, 
input); tensorInputs.push_back(tensorIn); } - auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); SmallVector tensorOutputs(callOp.getResults()); @@ -340,7 +359,7 @@ struct ForwardOpInterface } auto F = rewriter.getIntegerAttr(rewriter.getI1Type(), 0); - bufferization::replaceOpWithNewBufferizedOp(rewriter, op, tapeMemrefOutputs, F); + rewriter.create(loc, tapeMemrefOutputs, F); return success(); } @@ -389,9 +408,6 @@ struct ReverseOpInterface const bufferization::BufferizationOptions &options) const { auto reverseOp = cast(op); - llvm::outs() << "Found reverse!\n"; - llvm::outs() << "Found reverse!\n"; - llvm::outs() << "Found reverse!\n"; auto argc = reverseOp.getArgc(); auto resc = reverseOp.getResc(); @@ -401,8 +417,28 @@ struct ReverseOpInterface SmallVector cotangents; SmallVector tapeElements; + // Update signature + auto argTys = reverseOp.getArgumentTypes(); + auto retTys = reverseOp.getResultTypes(); + SmallVector emptyRets; + SmallVector args(argTys.begin(), argTys.end()); + args.insert(args.end(), retTys.begin(), retTys.end()); + SmallVector bufferArgs; + for (Type ty : args) { + auto tensorType = dyn_cast(ty); + if (!tensorType) + bufferArgs.push_back(ty); + else + bufferArgs.push_back( + MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + } + auto reverseTy = rewriter.getFunctionType(bufferArgs, emptyRets); + Block *block; - rewriter.modifyOpInPlace(op, [&] { block = reverseOp.addEntryBlock(); }); + rewriter.modifyOpInPlace(op, [&] { + reverseOp.setFunctionType(reverseTy); + block = reverseOp.addEntryBlock(); + }); PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(block); From 1e5f96c538cd2b2345b2e75693d5c7fc9f181c4f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 15:41:50 -0400 Subject: [PATCH 079/183] Do not bufferize ForwardOp if its implementation is not bufferized --- .../BufferizableOpInterfaceImpl.cpp | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 156db2c599..b945cbfa69 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -10,6 +10,7 @@ #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/Transforms/DialectConversion.h" #include "Gradient/IR/GradientOps.h" @@ -248,6 +249,21 @@ struct ForwardOpInterface auto forwardOp = cast(op); bool hasTensorArg = any_of(forwardOp.getArgumentTypes(), isaTensor); bool hasTensorResult = any_of(forwardOp.getResultTypes(), isaTensor); + + // Implementation must be bufferized. + auto implAttr = forwardOp.getImplementationAttr(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); + auto implArgTy = implOp.getArgumentTypes(); + auto implResTy = implOp.getResultTypes(); + for (auto ty: implArgTy) { + if (!isa(ty)) + return false; + } + for (auto ty: implResTy) { + if (!isa(ty)) + return false; + } + if (hasTensorArg || hasTensorResult) return true; @@ -333,15 +349,16 @@ struct ForwardOpInterface SmallVector tensorInputs; for (auto input : inputs) { - Value tensorIn = rewriter.create(loc, input); + Value tensorIn = (isa(input.getType())) ? 
input : + rewriter.create(loc, input); tensorInputs.push_back(tensorIn); } auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); SmallVector tensorOutputs(callOp.getResults()); for (auto [memrefOutput, tensorOutput] : llvm::zip(outputs, tensorOutputs)) { - Value castVal = rewriter.create(loc, memrefOutput.getType(), - tensorOutput); + Value castVal = (isa(tensorOutput.getType())) ? tensorOutput : + rewriter.create(loc, memrefOutput.getType(), tensorOutput); rewriter.create(loc, castVal, memrefOutput); } @@ -353,7 +370,7 @@ struct ForwardOpInterface SmallVector tapeMemrefOutputs; for (auto [tapeTensorOutput, memrefTapeOutput] : llvm::zip(tapeOutputs, forwardOp.getResultTypes())) { - Value castVal = + Value castVal = (isa(tapeTensorOutput.getType())) ? tapeTensorOutput : rewriter.create(loc, memrefTapeOutput, tapeTensorOutput); tapeMemrefOutputs.push_back(castVal); } @@ -504,6 +521,6 @@ void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRe AdjointOp::attachInterface(*ctx); BackpropOp::attachInterface(*ctx); ForwardOp::attachInterface(*ctx); - ReverseOp::attachInterface(*ctx); + //ReverseOp::attachInterface(*ctx); }); } \ No newline at end of file From a2af4576e0ce453ef6786ff0d83764603074717c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 15:46:56 -0400 Subject: [PATCH 080/183] Update ReverseOp --- .../BufferizableOpInterfaceImpl.cpp | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index b945cbfa69..b74f22509b 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -396,6 +396,21 @@ struct ReverseOpInterface auto reverseOp = cast(op); bool hasTensorArg = any_of(reverseOp.getArgumentTypes(), isaTensor); bool hasTensorResult = any_of(reverseOp.getResultTypes(), isaTensor); + + // Implementation must be bufferized. + auto implAttr = reverseOp.getImplementationAttr(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); + auto implArgTy = implOp.getArgumentTypes(); + auto implResTy = implOp.getResultTypes(); + for (auto ty: implArgTy) { + if (!isa(ty)) + return false; + } + for (auto ty: implResTy) { + if (!isa(ty)) + return false; + } + if (hasTensorArg || hasTensorResult) return true; @@ -488,12 +503,14 @@ struct ReverseOpInterface SmallVector tensorInputs; for (auto tapeElement : tapeElements) { - Value tensorIn = rewriter.create(loc, tapeElement); + Value tensorIn = (isa(tapeElement.getType())) ? tapeElement : + rewriter.create(loc, tapeElement); tensorInputs.push_back(tensorIn); } for (auto cotangent : cotangents) { - Value tensorIn = rewriter.create(loc, cotangent); + Value tensorIn = (isa(cotangent.getType())) ? cotangent : + rewriter.create(loc, cotangent); tensorInputs.push_back(tensorIn); } @@ -501,8 +518,8 @@ struct ReverseOpInterface SmallVector tensorOutputs(callOp.getResults()); for (auto [differential, tensorOutput] : llvm::zip(differentials, tensorOutputs)) { - Value castVal = rewriter.create(loc, differential.getType(), - tensorOutput); + Value castVal = (isa(tensorOutput.getType())) ? 
tensorOutput : + rewriter.create(loc, differential.getType(), tensorOutput); rewriter.create(loc, castVal, differential); } @@ -521,6 +538,6 @@ void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRe AdjointOp::attachInterface(*ctx); BackpropOp::attachInterface(*ctx); ForwardOp::attachInterface(*ctx); - //ReverseOp::attachInterface(*ctx); + ReverseOp::attachInterface(*ctx); }); } \ No newline at end of file From 723fa1139df2453a4226cefe321955dd4559fd30 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 15:58:14 -0400 Subject: [PATCH 081/183] Update conditions for ForwardOp and ReverseOp --- .../Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index b74f22509b..f5bfe7d91d 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -17,6 +17,7 @@ #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" #include "Gradient/Utils/GradientShape.h" #include "Quantum/IR/QuantumOps.h" +#include "llvm/ADT/STLExtras.h" using namespace mlir; using namespace catalyst::gradient; @@ -249,6 +250,8 @@ struct ForwardOpInterface auto forwardOp = cast(op); bool hasTensorArg = any_of(forwardOp.getArgumentTypes(), isaTensor); bool hasTensorResult = any_of(forwardOp.getResultTypes(), isaTensor); + bool hasTensorFuncInType = any_of(forwardOp.getFunctionType().getInputs(), isaTensor); + bool hasTensorFuncOutType = any_of(forwardOp.getFunctionType().getResults(), isaTensor); // Implementation must be bufferized. auto implAttr = forwardOp.getImplementationAttr(); @@ -264,7 +267,7 @@ struct ForwardOpInterface return false; } - if (hasTensorArg || hasTensorResult) + if (hasTensorArg || hasTensorResult || hasTensorFuncInType || hasTensorFuncOutType) return true; return false; @@ -396,6 +399,8 @@ struct ReverseOpInterface auto reverseOp = cast(op); bool hasTensorArg = any_of(reverseOp.getArgumentTypes(), isaTensor); bool hasTensorResult = any_of(reverseOp.getResultTypes(), isaTensor); + bool hasTensorFuncInType = any_of(reverseOp.getFunctionType().getInputs(), isaTensor); + bool hasTensorFuncOutType = any_of(reverseOp.getFunctionType().getResults(), isaTensor); // Implementation must be bufferized. 
auto implAttr = reverseOp.getImplementationAttr(); @@ -411,7 +416,7 @@ struct ReverseOpInterface return false; } - if (hasTensorArg || hasTensorResult) + if (hasTensorArg || hasTensorResult || hasTensorFuncInType || hasTensorFuncOutType) return true; return false; From 49b197b84d554e34621fb689f9b0ea6c7d17b228 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 16:55:29 -0400 Subject: [PATCH 082/183] Remove ForwardOp and ReverseOp workarounds --- .../BufferizableOpInterfaceImpl.cpp | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index f5bfe7d91d..ea716be399 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -252,21 +252,6 @@ struct ForwardOpInterface bool hasTensorResult = any_of(forwardOp.getResultTypes(), isaTensor); bool hasTensorFuncInType = any_of(forwardOp.getFunctionType().getInputs(), isaTensor); bool hasTensorFuncOutType = any_of(forwardOp.getFunctionType().getResults(), isaTensor); - - // Implementation must be bufferized. - auto implAttr = forwardOp.getImplementationAttr(); - auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); - auto implArgTy = implOp.getArgumentTypes(); - auto implResTy = implOp.getResultTypes(); - for (auto ty: implArgTy) { - if (!isa(ty)) - return false; - } - for (auto ty: implResTy) { - if (!isa(ty)) - return false; - } - if (hasTensorArg || hasTensorResult || hasTensorFuncInType || hasTensorFuncOutType) return true; @@ -401,21 +386,6 @@ struct ReverseOpInterface bool hasTensorResult = any_of(reverseOp.getResultTypes(), isaTensor); bool hasTensorFuncInType = any_of(reverseOp.getFunctionType().getInputs(), isaTensor); bool hasTensorFuncOutType = any_of(reverseOp.getFunctionType().getResults(), isaTensor); - - // Implementation must be bufferized. 
- auto implAttr = reverseOp.getImplementationAttr(); - auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); - auto implArgTy = implOp.getArgumentTypes(); - auto implResTy = implOp.getResultTypes(); - for (auto ty: implArgTy) { - if (!isa(ty)) - return false; - } - for (auto ty: implResTy) { - if (!isa(ty)) - return false; - } - if (hasTensorArg || hasTensorResult || hasTensorFuncInType || hasTensorFuncOutType) return true; From 0d4bfbd29244626ca39ee16a83738a5ffe794e53 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 17:42:10 -0400 Subject: [PATCH 083/183] Create templates for ForwardOp and ReverseOp preprocessing --- mlir/include/Gradient/Transforms/Passes.h | 1 + mlir/include/Gradient/Transforms/Passes.td | 12 ++++ mlir/include/Gradient/Transforms/Patterns.h | 1 + .../Catalyst/Transforms/RegisterAllPasses.cpp | 1 + mlir/lib/Gradient/Transforms/CMakeLists.txt | 2 + .../Transforms/PreprocessingPatterns.cpp | 69 +++++++++++++++++++ .../Transforms/gradient_preprocess.cpp | 58 ++++++++++++++++ 7 files changed, 144 insertions(+) create mode 100644 mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp create mode 100644 mlir/lib/Gradient/Transforms/gradient_preprocess.cpp diff --git a/mlir/include/Gradient/Transforms/Passes.h b/mlir/include/Gradient/Transforms/Passes.h index 40881b28fb..539e562905 100644 --- a/mlir/include/Gradient/Transforms/Passes.h +++ b/mlir/include/Gradient/Transforms/Passes.h @@ -22,6 +22,7 @@ namespace catalyst { // Pass creation for use in mlir opt tools std::unique_ptr createGradientBufferizationPass(); +std::unique_ptr createGradientPreprocessingPass(); std::unique_ptr createGradientLoweringPass(); std::unique_ptr createGradientConversionPass(); diff --git a/mlir/include/Gradient/Transforms/Passes.td b/mlir/include/Gradient/Transforms/Passes.td index 0b1fbfb76f..3c75e61ab2 100644 --- a/mlir/include/Gradient/Transforms/Passes.td +++ b/mlir/include/Gradient/Transforms/Passes.td @@ -71,4 +71,16 @@ def GradientConversionPass : Pass<"convert-gradient-to-llvm"> { ]; } +def GradientPreprocessingPass : Pass<"gradient-preprocess"> { + let summary = "Bufferize tensors in quantum operations."; + + let dependentDialects = [ + "bufferization::BufferizationDialect", + "memref::MemRefDialect", + "index::IndexDialect" + ]; + + let constructor = "catalyst::createGradientPreprocessingPass()"; +} + #endif // GRADIENT_PASSES diff --git a/mlir/include/Gradient/Transforms/Patterns.h b/mlir/include/Gradient/Transforms/Patterns.h index c21dac8f6c..bfa46f1130 100644 --- a/mlir/include/Gradient/Transforms/Patterns.h +++ b/mlir/include/Gradient/Transforms/Patterns.h @@ -22,6 +22,7 @@ namespace catalyst { namespace gradient { void populateBufferizationPatterns(mlir::TypeConverter &, mlir::RewritePatternSet &); +void populatePreprocessingPatterns(mlir::RewritePatternSet &); void populateLoweringPatterns(mlir::RewritePatternSet &); void populateConversionPatterns(mlir::LLVMTypeConverter &, mlir::RewritePatternSet &); diff --git a/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp b/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp index ea0220c582..48d0110b77 100644 --- a/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp +++ b/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp @@ -25,6 +25,7 @@ void catalyst::registerAllCatalystPasses() mlir::registerPass(catalyst::createGradientBufferizationPass); mlir::registerPass(catalyst::createGradientLoweringPass); mlir::registerPass(catalyst::createGradientConversionPass); + 
mlir::registerPass(catalyst::createGradientPreprocessingPass); mlir::registerPass(catalyst::createScatterLoweringPass); mlir::registerPass(catalyst::createAdjointLoweringPass); mlir::registerPass(catalyst::createQuantumBufferizationPass); diff --git a/mlir/lib/Gradient/Transforms/CMakeLists.txt b/mlir/lib/Gradient/Transforms/CMakeLists.txt index 82c00aa58a..ab26e832bc 100644 --- a/mlir/lib/Gradient/Transforms/CMakeLists.txt +++ b/mlir/lib/Gradient/Transforms/CMakeLists.txt @@ -5,6 +5,8 @@ file(GLOB SRC BufferizableOpInterfaceImpl.cpp BufferizationPatterns.cpp gradient_bufferize.cpp + PreprocessingPatterns.cpp + gradient_preprocess.cpp LoweringPatterns.cpp gradient_lowering.cpp ConversionPatterns.cpp diff --git a/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp new file mode 100644 index 0000000000..3330c64919 --- /dev/null +++ b/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp @@ -0,0 +1,69 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "iostream" +#include "llvm/Support/raw_ostream.h" + +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "Gradient/IR/GradientOps.h" +#include "Gradient/Transforms/Passes.h" +#include "Gradient/Utils/GradientShape.h" + +using namespace mlir; +using namespace catalyst::gradient; + +namespace { + +struct PreprocessForwardOp : public OpRewritePattern { + using mlir::OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult matchAndRewrite(ForwardOp op, + mlir::PatternRewriter &rewriter) const override + { + llvm::outs() << "forward\n"; + return success(); + } +}; + +struct PreprocessReverseOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult matchAndRewrite(ReverseOp op, + mlir::PatternRewriter &rewriter) const override + { + llvm::outs() << "reverse\n"; + return success(); + } +}; + +} // namespace + +namespace catalyst { +namespace gradient { + +void populatePreprocessingPatterns(RewritePatternSet &patterns) +{ + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); +} + +} // namespace gradient +} // namespace catalyst diff --git a/mlir/lib/Gradient/Transforms/gradient_preprocess.cpp b/mlir/lib/Gradient/Transforms/gradient_preprocess.cpp new file mode 100644 index 0000000000..f470d5f8d9 --- /dev/null +++ b/mlir/lib/Gradient/Transforms/gradient_preprocess.cpp @@ -0,0 +1,58 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "Gradient/IR/GradientOps.h" +#include "Gradient/Transforms/Passes.h" +#include "Gradient/Transforms/Patterns.h" + +using namespace mlir; +using namespace catalyst::gradient; + +namespace catalyst { +namespace gradient { + +#define GEN_PASS_DEF_GRADIENTPREPROCESSINGPASS +#include "Gradient/Transforms/Passes.h.inc" + +struct GradientPreprocessingPass : impl::GradientPreprocessingPassBase { + using GradientPreprocessingPassBase::GradientPreprocessingPassBase; + + void runOnOperation() final + { + RewritePatternSet patterns(&getContext()); + populatePreprocessingPatterns(patterns); + + if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { + return signalPassFailure(); + } + } +}; + +} // namespace gradient + +std::unique_ptr createGradientPreprocessingPass() +{ + return std::make_unique(); +} + +} // namespace catalyst From 79ce24e8aca26dcbf24fe81dfb18886f8eb23892 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 12 Sep 2024 18:23:48 -0400 Subject: [PATCH 084/183] Use preprocessing steps to add Func.call in ForwardOp and ReverseOp --- .../Transforms/PreprocessingPatterns.cpp | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp index 3330c64919..51b0bcd73d 100644 --- a/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PreprocessingPatterns.cpp @@ -21,6 +21,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include "Gradient/IR/GradientOps.h" @@ -38,7 +39,55 @@ struct PreprocessForwardOp : public OpRewritePattern { mlir::LogicalResult matchAndRewrite(ForwardOp op, mlir::PatternRewriter &rewriter) const override { + if(!op.getFunctionBody().empty()) + return failure(); + llvm::outs() << "forward\n"; + + auto argc = op.getArgc(); + auto resc = op.getResc(); + SmallVector inputs; + SmallVector differentials; + SmallVector outputs; + SmallVector cotangents; + + Block *block; + rewriter.modifyOpInPlace(op, [&] { block = op.addEntryBlock(); }); + + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(block); + auto params = op.getArguments(); + + for (size_t i = 0; i < argc * 2; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? differentials.push_back(val) : inputs.push_back(val); + } + + auto upperLimit = (argc * 2) + (resc * 2); + for (size_t i = argc * 2; i < upperLimit; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? 
cotangents.push_back(val) : outputs.push_back(val); + } + + auto implAttr = op.getImplementationAttr(); + auto impl = op.getImplementation(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); + auto implResTy = implOp.getResultTypes(); + Location loc = op.getLoc(); + + auto callOp = rewriter.create(loc, impl, implResTy, inputs); + SmallVector tensorOutputs(callOp.getResults()); + + auto tapeCount = op.getTape(); + SmallVector tapeOutputs; + tapeOutputs.insert(tapeOutputs.begin(), tensorOutputs.end() - tapeCount, + tensorOutputs.end()); + + auto F = rewriter.getIntegerAttr(rewriter.getI1Type(), 0); + rewriter.create(loc, tapeOutputs, F); + return success(); } }; @@ -49,7 +98,65 @@ struct PreprocessReverseOp : public OpRewritePattern { mlir::LogicalResult matchAndRewrite(ReverseOp op, mlir::PatternRewriter &rewriter) const override { + if(!op.getFunctionBody().empty()) + return failure(); + llvm::outs() << "reverse\n"; + + auto argc = op.getArgc(); + auto resc = op.getResc(); + SmallVector inputs; + SmallVector differentials; + SmallVector outputs; + SmallVector cotangents; + SmallVector tapeElements; + + Block *block; + rewriter.modifyOpInPlace(op, [&] { block = op.addEntryBlock(); }); + + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(block); + auto params = op.getArguments(); + + for (size_t i = 0; i < argc * 2; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? differentials.push_back(val) : inputs.push_back(val); + } + + auto upperLimit = (argc * 2) + (resc * 2); + for (size_t i = argc * 2; i < upperLimit; i++) { + bool isDup = (i % 2) != 0; + Value val = params[i]; + isDup ? cotangents.push_back(val) : outputs.push_back(val); + } + + auto tapeCount = op.getTape(); + auto uppestLimit = upperLimit + tapeCount; + for (size_t i = upperLimit; i < uppestLimit; i++) { + tapeElements.push_back(params[i]); + } + + auto implAttr = op.getImplementationAttr(); + auto impl = op.getImplementation(); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); + auto implResTy = implOp.getResultTypes(); + Location loc = op.getLoc(); + + SmallVector tensorInputs; + for (auto tapeElement : tapeElements) { + tensorInputs.push_back(tapeElement); + } + + for (auto cotangent : cotangents) { + tensorInputs.push_back(cotangent); + } + + rewriter.create(loc, impl, implResTy, tensorInputs); + + auto T = rewriter.getIntegerAttr(rewriter.getI1Type(), 1); + rewriter.create(loc, ValueRange{}, T); + return success(); } }; From 7d5fc1f35fb1ecbc07287a7943749f17383e6ca7 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 16 Sep 2024 17:07:28 -0400 Subject: [PATCH 085/183] Remove llvm scf patch --- mlir/Makefile | 4 - mlir/patches/scf-for-preprocessing.patch | 128 ----------------------- 2 files changed, 132 deletions(-) delete mode 100644 mlir/patches/scf-for-preprocessing.patch diff --git a/mlir/Makefile b/mlir/Makefile index 4c8b9ef4f3..debedd0c3a 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -14,7 +14,6 @@ ENABLE_ASAN?=OFF BUILD_TYPE?=Release LLVM_ROOT=$(MK_DIR)/llvm-project LLVM_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch -LLVM_SCF_PATCH_FILE=$(MK_DIR)/patches/scf-for-preprocessing.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -60,9 +59,6 @@ llvm: @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 --directory=$(LLVM_ROOT) < 
$(LLVM_MODULE_PATCH_FILE); \ fi - @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_SCF_PATCH_FILE) > /dev/null 2>&1; then \ - patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_SCF_PATCH_FILE); \ - fi cmake -G Ninja -S llvm-project/llvm -B $(LLVM_BUILD_DIR) \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ -DLLVM_BUILD_EXAMPLES=OFF \ diff --git a/mlir/patches/scf-for-preprocessing.patch b/mlir/patches/scf-for-preprocessing.patch deleted file mode 100644 index 769556de62..0000000000 --- a/mlir/patches/scf-for-preprocessing.patch +++ /dev/null @@ -1,128 +0,0 @@ -diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h -index 90b315e83a8cfd..6107219ea94ae1 100644 ---- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h -+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.h -@@ -23,6 +23,9 @@ namespace mlir { - /// Creates a pass that bufferizes the SCF dialect. - std::unique_ptr createSCFBufferizePass(); - -+/// Creates a pass that preprocesses SCF loop before One-Shot Bufferize. -+std::unique_ptr createSCFLoopBufferizationPreprocessingPass(); -+ - /// Creates a pass that specializes for loop for unrolling and - /// vectorization. - std::unique_ptr createForLoopSpecializationPass(); -diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td -index 350611ad86873d..94d3e51a1c9044 100644 ---- a/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td -+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Passes.td -@@ -18,6 +18,27 @@ def SCFBufferize : Pass<"scf-bufferize"> { - "memref::MemRefDialect"]; - } - -+def SCFLoopBufferizationPreprocessing -+ : Pass<"scf-loop-bufferization-preprocessing"> { -+ let summary = "Preprocess loops before One-Shot Bufferize"; -+ -+ let description = [{ -+ Preprocess `scf.for` loops before running One-Shot Bufferize to support -+ loops where a yielded tensor is not equivalent to the respective iter_arg. -+ Such IR is currently not supported by One-Shot Bufferize. -+ -+ This pass inserts a `bufferization.materialize_in_destination` op for every -+ yielded tensor, such that the yielded value is guaranteed to materialize in -+ the future buffer of the iter_arg; this is done by copying the tensor -+ contents into the iter_arg buffer. Such memcpys are a no-op in case the -+ tensor contents already materialize in the iter_arg buffer. -+ }]; -+ -+ let constructor = "mlir::createSCFLoopBufferizationPreprocessingPass()"; -+ let dependentDialects = ["bufferization::BufferizationDialect", -+ "scf::SCFDialect"]; -+} -+ - // Note: Making these canonicalization patterns would require a dependency - // of the SCF dialect on the Affine/Tensor/MemRef dialects or vice versa. 
- def SCFForLoopCanonicalization -diff --git a/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp b/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp -index 21c618ab633f60..727c4fc7c6396e 100644 ---- a/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp -+++ b/mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp -@@ -17,6 +17,7 @@ - - namespace mlir { - #define GEN_PASS_DEF_SCFBUFFERIZE -+#define GEN_PASS_DEF_SCFLOOPBUFFERIZATIONPREPROCESSING - #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" - } // namespace mlir - -@@ -40,8 +41,40 @@ struct SCFBufferizePass : public impl::SCFBufferizeBase { - return signalPassFailure(); - }; - }; -+ -+struct SCFLoopBufferizationPreprocessingPass -+ : public impl::SCFLoopBufferizationPreprocessingBase< -+ SCFLoopBufferizationPreprocessingPass> { -+ void runOnOperation() override { -+ OpBuilder builder(getOperation()->getContext()); -+ getOperation()->walk([&](scf::YieldOp yieldOp) { -+ builder.setInsertionPoint(yieldOp); -+ // TODO: Support scf.while. -+ auto forOp = dyn_cast(yieldOp->getParentOp()); -+ if (!forOp) -+ return WalkResult::skip(); -+ for (OpOperand &operand : yieldOp->getOpOperands()) { -+ auto tensorType = dyn_cast(operand.get().getType()); -+ if (!tensorType) -+ continue; -+ auto bbArg = forOp.getRegionIterArgs()[operand.getOperandNumber()]; -+ Value materialized = -+ builder -+ .create( -+ yieldOp.getLoc(), tensorType, operand.get(), bbArg) -+ .getResult(); -+ operand.set(materialized); -+ } -+ return WalkResult::advance(); -+ }); -+ } -+}; - } // namespace - - std::unique_ptr mlir::createSCFBufferizePass() { - return std::make_unique(); - } -+ -+std::unique_ptr mlir::createSCFLoopBufferizationPreprocessingPass() { -+ return std::make_unique(); -+} -diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir -new file mode 100644 -index 00000000000000..17661178245088 ---- /dev/null -+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-preprocessing.mlir -@@ -0,0 +1,23 @@ -+// RUN: mlir-opt %s -scf-loop-bufferization-preprocessing -one-shot-bufferize="bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" -canonicalize | FileCheck %s -+ -+// CHECK-LABEL: func @conflict_in_loop( -+// CHECK-SAME: %[[A:.*]]: memref<10xf32> -+func.func @conflict_in_loop(%A: tensor<10xf32>, %f: f32, %idx: index, %lb: index, %ub: index, %step: index) -> f32 { -+ // CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -+ %r = scf.for %i = %lb to %ub step %step iter_args(%tA = %A) -> (tensor<10xf32>) { -+ // CHECK: %[[alloc:.*]] = memref.alloc() -+ // CHECK: memref.copy %[[A]], %[[alloc]] -+ // CHECK: memref.store %{{.*}}, %[[alloc]] -+ %0 = tensor.insert %f into %tA[%i] : tensor<10xf32> -+ // CHECK: %[[read:.*]] = memref.load %[[A]] -+ %read = tensor.extract %tA[%idx] : tensor<10xf32> -+ // CHECK: vector.print %[[read]] -+ vector.print %read : f32 -+ // CHECK: memref.copy %[[alloc]], %[[A]] -+ scf.yield %0 : tensor<10xf32> -+ } -+ -+ // CHECK: memref.load %[[A]] -+ %f0 = tensor.extract %r[%step] : tensor<10xf32> -+ return %f0 : f32 -+} From 6a24347b880efc28b5172db5247960f4b693ce03 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 10:50:59 -0400 Subject: [PATCH 086/183] CI warning cleanup --- mlir/include/Gradient/Transforms/Passes.td | 12 ---- .../BufferizableOpInterfaceImpl.cpp | 1 - .../Catalyst/Transforms/RegisterAllPasses.cpp | 1 - mlir/lib/Gradient/IR/GradientDialect.cpp | 4 +- .../BufferizableOpInterfaceImpl.cpp | 72 ++++++++++--------- 
.../GradMethods/ClassicalJacobian.cpp | 7 +- .../GradMethods/PS_QuantumGradient.cpp | 7 +- mlir/lib/Quantum/IR/QuantumDialect.cpp | 6 +- 8 files changed, 53 insertions(+), 57 deletions(-) diff --git a/mlir/include/Gradient/Transforms/Passes.td b/mlir/include/Gradient/Transforms/Passes.td index 8173167d49..72a7dcf7c0 100644 --- a/mlir/include/Gradient/Transforms/Passes.td +++ b/mlir/include/Gradient/Transforms/Passes.td @@ -93,16 +93,4 @@ def GradientConversionPass : Pass<"convert-gradient-to-llvm"> { ]; } -def GradientPreprocessingPass : Pass<"gradient-preprocess"> { - let summary = "Bufferize tensors in quantum operations."; - - let dependentDialects = [ - "bufferization::BufferizationDialect", - "memref::MemRefDialect", - "index::IndexDialect" - ]; - - let constructor = "catalyst::createGradientPreprocessingPass()"; -} - #endif // GRADIENT_PASSES diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index fff38f63c2..15f287f1d5 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -124,7 +124,6 @@ struct CustomCallOpInterface struct CallbackOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< CallbackOpInterface, CallbackOp> { - static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const diff --git a/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp b/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp index 13a2dee3e2..e0eb41f7f7 100644 --- a/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp +++ b/mlir/lib/Catalyst/Transforms/RegisterAllPasses.cpp @@ -27,7 +27,6 @@ void catalyst::registerAllCatalystPasses() mlir::registerPass(catalyst::createGradientPostprocessingPass); mlir::registerPass(catalyst::createGradientLoweringPass); mlir::registerPass(catalyst::createGradientConversionPass); - mlir::registerPass(catalyst::createGradientPreprocessingPass); mlir::registerPass(catalyst::createScatterLoweringPass); mlir::registerPass(catalyst::createAdjointLoweringPass); mlir::registerPass(catalyst::createQuantumBufferizationPass); diff --git a/mlir/lib/Gradient/IR/GradientDialect.cpp b/mlir/lib/Gradient/IR/GradientDialect.cpp index c80b1b5eb6..068079b99f 100644 --- a/mlir/lib/Gradient/IR/GradientDialect.cpp +++ b/mlir/lib/Gradient/IR/GradientDialect.cpp @@ -51,8 +51,8 @@ void GradientDialect::initialize() #include "Gradient/IR/GradientOps.cpp.inc" >(); addInterface(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index ea716be399..a5053f2598 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -47,8 +47,8 @@ Value generateAllocation(OpBuilder &builder, Location loc, Value reference) /// /// The allocation size and shape is deduced from a list of existing memref values. 
/// -void generateAllocations(RewriterBase &rewriter, Location loc, - SmallVectorImpl &allocations, ValueRange referenceValues) +void generateAllocations(RewriterBase &rewriter, Location loc, SmallVectorImpl &allocations, + ValueRange referenceValues) { for (Value memref : referenceValues) { allocations.push_back( @@ -106,16 +106,16 @@ struct AdjointOpInterface bufferArgs.push_back(*opBuffer); } - - rewriter.create(loc, TypeRange{}, adjointOp.getCalleeAttr(), adjointOp.getGradSize(), - bufferArgs, memrefValues); + rewriter.create(loc, TypeRange{}, adjointOp.getCalleeAttr(), + adjointOp.getGradSize(), bufferArgs, memrefValues); bufferization::replaceOpWithBufferizedValues(rewriter, op, memrefValues); return success(); } }; struct BackpropOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { @@ -153,15 +153,15 @@ struct BackpropOpInterface SmallVector bufferArgs; ValueRange operands = backpropOp.getArgs(); for (Value operand : operands) { - if(isa(operand.getType())) { + if (isa(operand.getType())) { FailureOr opBuffer = getBuffer(rewriter, operand, options); if (failed(opBuffer)) return failure(); bufferArgs.push_back(*opBuffer); - } else { + } + else { bufferArgs.push_back(operand); } - } std::vector diffArgs = @@ -238,10 +238,8 @@ struct BackpropOpInterface struct ForwardOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - ForwardOpInterface, ForwardOp> { - + ForwardOpInterface, ForwardOp> { static bool supportsUnstructuredControlFlow() { return true; } - bool hasTensorSemantics(Operation *op) const { auto isaTensor = llvm::IsaPred; @@ -305,10 +303,10 @@ struct ForwardOpInterface MemRefType::get(tensorType.getShape(), tensorType.getElementType())); } auto forwardTy = rewriter.getFunctionType(bufferArgs, emptyRets); - + Block *block; - rewriter.modifyOpInPlace(op, [&] { - forwardOp.setFunctionType(forwardTy); + rewriter.modifyOpInPlace(op, [&] { + forwardOp.setFunctionType(forwardTy); block = forwardOp.addEntryBlock(); }); @@ -337,16 +335,19 @@ struct ForwardOpInterface SmallVector tensorInputs; for (auto input : inputs) { - Value tensorIn = (isa(input.getType())) ? input : - rewriter.create(loc, input); + Value tensorIn = (isa(input.getType())) + ? input + : rewriter.create(loc, input); tensorInputs.push_back(tensorIn); } auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); SmallVector tensorOutputs(callOp.getResults()); for (auto [memrefOutput, tensorOutput] : llvm::zip(outputs, tensorOutputs)) { - Value castVal = (isa(tensorOutput.getType())) ? tensorOutput : - rewriter.create(loc, memrefOutput.getType(), tensorOutput); + Value castVal = (isa(tensorOutput.getType())) + ? tensorOutput + : rewriter.create( + loc, memrefOutput.getType(), tensorOutput); rewriter.create(loc, castVal, memrefOutput); } @@ -358,8 +359,10 @@ struct ForwardOpInterface SmallVector tapeMemrefOutputs; for (auto [tapeTensorOutput, memrefTapeOutput] : llvm::zip(tapeOutputs, forwardOp.getResultTypes())) { - Value castVal = (isa(tapeTensorOutput.getType())) ? tapeTensorOutput : - rewriter.create(loc, memrefTapeOutput, tapeTensorOutput); + Value castVal = (isa(tapeTensorOutput.getType())) + ? 
tapeTensorOutput + : rewriter.create(loc, memrefTapeOutput, + tapeTensorOutput); tapeMemrefOutputs.push_back(castVal); } @@ -372,7 +375,7 @@ struct ForwardOpInterface struct ReverseOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - ReverseOpInterface, ReverseOp> { + ReverseOpInterface, ReverseOp> { static bool supportsUnstructuredControlFlow() { return true; } @@ -440,10 +443,10 @@ struct ReverseOpInterface MemRefType::get(tensorType.getShape(), tensorType.getElementType())); } auto reverseTy = rewriter.getFunctionType(bufferArgs, emptyRets); - + Block *block; - rewriter.modifyOpInPlace(op, [&] { - reverseOp.setFunctionType(reverseTy); + rewriter.modifyOpInPlace(op, [&] { + reverseOp.setFunctionType(reverseTy); block = reverseOp.addEntryBlock(); }); @@ -472,20 +475,23 @@ struct ReverseOpInterface auto implAttr = reverseOp.getImplementationAttr(); auto impl = reverseOp.getImplementation(); - auto implOp = SymbolTable::lookupNearestSymbolFrom(reverseOp, implAttr); + auto implOp = + SymbolTable::lookupNearestSymbolFrom(reverseOp, implAttr); auto implResTy = implOp.getResultTypes(); Location loc = reverseOp.getLoc(); SmallVector tensorInputs; for (auto tapeElement : tapeElements) { - Value tensorIn = (isa(tapeElement.getType())) ? tapeElement : - rewriter.create(loc, tapeElement); + Value tensorIn = (isa(tapeElement.getType())) + ? tapeElement + : rewriter.create(loc, tapeElement); tensorInputs.push_back(tensorIn); } for (auto cotangent : cotangents) { - Value tensorIn = (isa(cotangent.getType())) ? cotangent : - rewriter.create(loc, cotangent); + Value tensorIn = (isa(cotangent.getType())) + ? cotangent + : rewriter.create(loc, cotangent); tensorInputs.push_back(tensorIn); } @@ -493,8 +499,10 @@ struct ReverseOpInterface SmallVector tensorOutputs(callOp.getResults()); for (auto [differential, tensorOutput] : llvm::zip(differentials, tensorOutputs)) { - Value castVal = (isa(tensorOutput.getType())) ? tensorOutput : - rewriter.create(loc, differential.getType(), tensorOutput); + Value castVal = (isa(tensorOutput.getType())) + ? 
tensorOutput + : rewriter.create( + loc, differential.getType(), tensorOutput); rewriter.create(loc, castVal, differential); } diff --git a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp index fe0e4fb3e6..30baf98952 100644 --- a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp +++ b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp @@ -147,7 +147,8 @@ func::FuncOp genSplitPreprocessed(PatternRewriter &rewriter, Location loc, func: PatternRewriter::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(&splitFn.getBody().front()); Value paramsBuffer = rewriter.create(loc, paramsBufferType, paramCount); - Value paramsTensor = rewriter.create(loc, paramsBuffer, /*restrict=*/true); + Value paramsTensor = + rewriter.create(loc, paramsBuffer, /*restrict=*/true); qnodeQuantumArgs.push_back(paramsTensor); MemRefType paramsProcessedType = MemRefType::get({}, rewriter.getIndexType()); @@ -289,8 +290,8 @@ func::FuncOp genArgMapFunction(PatternRewriter &rewriter, Location loc, func::Fu else if (auto returnOp = dyn_cast(op)) { PatternRewriter::InsertionGuard insertionGuard(rewriter); rewriter.setInsertionPoint(returnOp); - Value paramsVector = - rewriter.create(loc, paramsVectorType, paramsBuffer, /*restrict=*/true) ; + Value paramsVector = rewriter.create( + loc, paramsVectorType, paramsBuffer, /*restrict=*/true); returnOp.getOperandsMutable().assign(paramsVector); } }); diff --git a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp index 84c6c19ba4..c51277ed68 100644 --- a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp +++ b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp @@ -58,7 +58,8 @@ static std::vector computePartialDerivative(PatternRewriter &rewriter, Lo { constexpr double shift = PI / 2; ShapedType shiftVectorType = RankedTensorType::get({numShifts}, rewriter.getF64Type()); - Value selectorVector = rewriter.create(loc, selectorBuffer, /*restrict=*/true); + Value selectorVector = + rewriter.create(loc, selectorBuffer, /*restrict=*/true); // Define the shift vectors (pos/neg) as sparse tensor constants. 
DenseElementsAttr nonZeroIndices = rewriter.getI64TensorAttr(currentShift); @@ -284,8 +285,8 @@ func::FuncOp ParameterShiftLowering::genQGradFunction(PatternRewriter &rewriter, std::vector gradientTensors; gradientTensors.reserve(gradResTypes.size()); for (Value gradientBuffer : gradientBuffers) { - gradientTensors.push_back( - rewriter.create(loc, gradientBuffer, /*restrict=*/true)); + gradientTensors.push_back(rewriter.create( + loc, gradientBuffer, /*restrict=*/true)); } op->setOperands(gradientTensors); } diff --git a/mlir/lib/Quantum/IR/QuantumDialect.cpp b/mlir/lib/Quantum/IR/QuantumDialect.cpp index c8c3788efd..d4d820326f 100644 --- a/mlir/lib/Quantum/IR/QuantumDialect.cpp +++ b/mlir/lib/Quantum/IR/QuantumDialect.cpp @@ -44,9 +44,9 @@ void QuantumDialect::initialize() #define GET_OP_LIST #include "Quantum/IR/QuantumOps.cpp.inc" >(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); } //===----------------------------------------------------------------------===// From 73bef19a8f53f82c161cef13d5909e4fab11c804 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 13:56:39 -0400 Subject: [PATCH 087/183] Update llvm patch --- mlir/patches/moduleOp-bufferization.patch | 37 +++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch index 25384f99fb..683f1d51a3 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/moduleOp-bufferization.patch @@ -34,7 +34,7 @@ index 0b91d3d675b7..8bed0dfc5814 100644 void registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp -index 053ea7935260..fed1c49d22be 100644 +index 9fbe574ec392..9749a71f3514 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -22,7 +22,7 @@ namespace mlir { @@ -842,7 +842,7 @@ index f2e9e839b7c4..5e5657980ba1 100644 module attributes {transform.with_named_sequence} { diff --git a/mlir/test/Dialect/Vector/transform-vector.mlir b/mlir/test/Dialect/Vector/transform-vector.mlir -index 75b29e22b4d2..21e615e60438 100644 +index 4b38db79bff3..0439844dc66c 100644 --- a/mlir/test/Dialect/Vector/transform-vector.mlir +++ b/mlir/test/Dialect/Vector/transform-vector.mlir @@ -1,16 +1,18 @@ @@ -894,3 +894,36 @@ index 75b29e22b4d2..21e615e60438 100644 } module attributes {transform.with_named_sequence} { +diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir +index 259475ebdbf4..85dbf6702332 100644 +--- a/mlir/test/Examples/transform/ChH/full.mlir ++++ b/mlir/test/Examples/transform/ChH/full.mlir +@@ -1,8 +1,6 @@ +-// RUN: mlir-opt %s --transform-interpreter \ +-// RUN: --test-transform-dialect-erase-schedule \ +-// RUN: --math-uplift-to-fma \ +-// RUN: --convert-bufferization-to-memref \ +-// RUN: --test-lower-to-llvm |\ ++// RUN: mlir-opt %s --transform-interpreter="debug-payload-root-tag=payload" \ ++// RUN: --test-transform-dialect-erase-schedule |\ ++// RUN: mlir-opt -pass-pipeline='builtin.module(builtin.module(math-uplift-to-fma,convert-bufferization-to-memref,test-lower-to-llvm))' - |\ + // RUN: FileCheck %s + + // Fixed-size tensor types to be used in convolution. 
+@@ -19,6 +17,7 @@ + // tensors annotated with attributes from the `bufferization` dialect. These + // attributes hint the bufferization pass to assume buffers can be directly + // used for these tensors without reshaping. ++module @payload attributes { transform.target_tag = "payload" } { + func.func @conv( + %input: !tinput {bufferization.writable = false, + bufferization.access = "read", +@@ -84,7 +83,7 @@ func.func @conv( + + return %relued : !toutput + } +- ++} + // Module containing the transformation script to be applied. The attribute + // is required to correctly verify the use of named (macro-like) sequences. + module attributes { transform.with_named_sequence } { From 66e7b06e2980fdd2c256c5d441b1a3175c58f749 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 14:26:14 -0400 Subject: [PATCH 088/183] Make gradient.ReturnOp ReturnLike --- mlir/include/Gradient/IR/GradientOps.h | 1 + mlir/include/Gradient/IR/GradientOps.td | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/include/Gradient/IR/GradientOps.h b/mlir/include/Gradient/IR/GradientOps.h index c6f6afadfe..a54e110043 100644 --- a/mlir/include/Gradient/IR/GradientOps.h +++ b/mlir/include/Gradient/IR/GradientOps.h @@ -21,6 +21,7 @@ #include "mlir/IR/OpImplementation.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "Gradient/IR/GradientInterfaces.h" diff --git a/mlir/include/Gradient/IR/GradientOps.td b/mlir/include/Gradient/IR/GradientOps.td index fbb43129fd..e30c4bf982 100644 --- a/mlir/include/Gradient/IR/GradientOps.td +++ b/mlir/include/Gradient/IR/GradientOps.td @@ -17,6 +17,7 @@ include "mlir/Interfaces/FunctionInterfaces.td" include "mlir/Interfaces/CallInterfaces.td" +include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/IR/BuiltinAttributes.td" include "mlir/IR/OpBase.td" @@ -388,7 +389,7 @@ def ReverseOp : Gradient_Op<"reverse", } def ReturnOp : Gradient_Op<"return", - [Terminator, ParentOneOf<["ForwardOp", "ReverseOp"]>]> { + [ReturnLike, Terminator, ParentOneOf<["ForwardOp", "ReverseOp"]>]> { let summary = "Return tapes or nothing"; From ea4f2d72227cd02616308da56577da6af8dd44e9 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 15:04:34 -0400 Subject: [PATCH 089/183] Draft new ForwardOp bufferization --- .../BufferizableOpInterfaceImpl.cpp | 95 ++++++++----------- 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index a5053f2598..1340c0bb7a 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -240,6 +240,7 @@ struct ForwardOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< ForwardOpInterface, ForwardOp> { static bool supportsUnstructuredControlFlow() { return true; } + bool hasTensorSemantics(Operation *op) const { auto isaTensor = llvm::IsaPred; @@ -280,14 +281,7 @@ struct ForwardOpInterface { auto forwardOp = cast(op); - auto argc = forwardOp.getArgc(); - auto resc = forwardOp.getResc(); - SmallVector inputs; - SmallVector differentials; - SmallVector outputs; - SmallVector cotangents; - - // Update signature + // Update ForwardOp's signature auto argTys = forwardOp.getArgumentTypes(); auto retTys = forwardOp.getResultTypes(); SmallVector 
emptyRets; @@ -303,72 +297,59 @@ struct ForwardOpInterface MemRefType::get(tensorType.getShape(), tensorType.getElementType())); } auto forwardTy = rewriter.getFunctionType(bufferArgs, emptyRets); - - Block *block; rewriter.modifyOpInPlace(op, [&] { forwardOp.setFunctionType(forwardTy); - block = forwardOp.addEntryBlock(); }); + // Get ForwardOp's block. + auto &block = forwardOp.getBody().front(); PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(block); - auto params = forwardOp.getArguments(); - - for (size_t i = 0; i < argc * 2; i++) { - bool isDup = (i % 2) != 0; - Value val = params[i]; - isDup ? differentials.push_back(val) : inputs.push_back(val); - } + rewriter.setInsertionPointToStart(&block); - auto upperLimit = (argc * 2) + (resc * 2); - for (size_t i = argc * 2; i < upperLimit; i++) { - bool isDup = (i % 2) != 0; - Value val = params[i]; - isDup ? cotangents.push_back(val) : outputs.push_back(val); - } + auto argc = forwardOp.getArgc(); + auto resc = forwardOp.getResc(); + // Get callee's implementation. auto implAttr = forwardOp.getImplementationAttr(); auto impl = forwardOp.getImplementation(); auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); auto implResTy = implOp.getResultTypes(); Location loc = forwardOp.getLoc(); - SmallVector tensorInputs; + // Create to_tensor if callee is not yet bufferized. + SmallVector inputs(forwardOp.getArguments()); + SmallVector calleeInputs; for (auto input : inputs) { - Value tensorIn = (isa(input.getType())) - ? input + auto tensorIn = (isa(input.getType())) ? input : rewriter.create(loc, input); - tensorInputs.push_back(tensorIn); + calleeInputs.push_back(tensorIn); } - auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); - SmallVector tensorOutputs(callOp.getResults()); - for (auto [memrefOutput, tensorOutput] : llvm::zip(outputs, tensorOutputs)) { - Value castVal = (isa(tensorOutput.getType())) - ? tensorOutput - : rewriter.create( - loc, memrefOutput.getType(), tensorOutput); - rewriter.create(loc, castVal, memrefOutput); - } - - auto tapeCount = forwardOp.getTape(); - SmallVector tapeOutputs; - tapeOutputs.insert(tapeOutputs.begin(), tensorOutputs.end() - tapeCount, - tensorOutputs.end()); - - SmallVector tapeMemrefOutputs; - for (auto [tapeTensorOutput, memrefTapeOutput] : - llvm::zip(tapeOutputs, forwardOp.getResultTypes())) { - Value castVal = (isa(tapeTensorOutput.getType())) - ? tapeTensorOutput - : rewriter.create(loc, memrefTapeOutput, - tapeTensorOutput); - tapeMemrefOutputs.push_back(castVal); - } - - auto F = rewriter.getIntegerAttr(rewriter.getI1Type(), 0); - rewriter.create(loc, tapeMemrefOutputs, F); + forwardOp.walk([&](func::CallOp callOp) { + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(callOp); + SmallVector inputs(callOp.getOperands()); + SmallVector calleeInputs; + for (auto input : inputs) { + auto tensorIn = (isa(input.getType())) ? input : + rewriter.create(loc, input); + calleeInputs.push_back(tensorIn); + } + rewriter.replaceOpWithNewOp(callOp, impl, implResTy, calleeInputs); + }); + forwardOp.walk([&](ReturnOp returnOp) { + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(returnOp); + SmallVector inputs(returnOp.getOperands()); + SmallVector returnInputs; + for (auto input : inputs) { + auto tensorIn = (isa(input.getType())) ? 
input : + rewriter.create(loc, input); + returnInputs.push_back(tensorIn); + } + rewriter.replaceOpWithNewOp(returnOp, returnInputs, returnOp.getEmpty()); + }); return success(); } }; @@ -376,7 +357,6 @@ struct ForwardOpInterface struct ReverseOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< ReverseOpInterface, ReverseOp> { - static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const @@ -417,6 +397,7 @@ struct ReverseOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { + return failure(); auto reverseOp = cast(op); auto argc = reverseOp.getArgc(); From accf60a92d11447b12e852cd064100b35d017489 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 16:27:36 -0400 Subject: [PATCH 090/183] Add getBufferType method to ForwardOp and ReverseOp Interface --- .../BufferizableOpInterfaceImpl.cpp | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 1340c0bb7a..96c9752dc6 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -24,6 +24,28 @@ using namespace catalyst::gradient; namespace { +static BaseMemRefType +getBufferizedFunctionArgType(FunctionOpInterface funcOp, int64_t index, + const bufferization::BufferizationOptions &options) { + auto tensorType = + dyn_cast(funcOp.getArgument(index).getType()); + assert(tensorType && "expected TensorType"); + + BaseMemRefType memrefType = options.functionArgTypeConverterFn( + tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, options); + + auto layoutAttr = funcOp.getArgAttrOfType( + index, bufferization::BufferizationDialect::kBufferLayoutAttrName); + if (!layoutAttr) + return memrefType; + + auto rankedMemrefType = dyn_cast(memrefType); + assert(rankedMemrefType && "buffer layout not supported on unranked tensors"); + return MemRefType::get( + rankedMemrefType.getShape(), rankedMemrefType.getElementType(), + layoutAttr.getValue(), rankedMemrefType.getMemorySpace()); +} + Value generateAllocation(OpBuilder &builder, Location loc, Value reference) { auto origMemrefType = cast(reference.getType()); @@ -276,6 +298,21 @@ struct ForwardOpInterface return {}; } + FailureOr + getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const { + auto funcOp = cast(op); + auto bbArg = cast(value); + + // Function arguments are special. + if (bbArg.getOwner() == &funcOp.getBody().front()) + return getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), + options); + + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: + getBufferType(op, value, options, invocationStack); + } + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { @@ -394,6 +431,21 @@ struct ReverseOpInterface return {}; } + FailureOr + getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const { + auto funcOp = cast(op); + auto bbArg = cast(value); + + // Function arguments are special. 
+ if (bbArg.getOwner() == &funcOp.getBody().front()) + return getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), + options); + + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: + getBufferType(op, value, options, invocationStack); + } + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { From 99c517a8da8853995fa6e3c8c9395172cbb84a6c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 16:47:17 -0400 Subject: [PATCH 091/183] Cleanup unused read-write flag in ForwardOp and ReverseOp --- .../BufferizableOpInterfaceImpl.cpp | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 96c9752dc6..c9b8a7c84e 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -279,18 +279,6 @@ struct ForwardOpInterface return false; } - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return true; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return true; - } - bufferization::AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const @@ -412,18 +400,6 @@ struct ReverseOpInterface return false; } - bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return true; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return true; - } - bufferization::AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const From c7768beec2788a20d2477035270f0bc8afe2e610 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 17 Sep 2024 16:55:35 -0400 Subject: [PATCH 092/183] Copy methods from FuncOp to ForwardOp and ReverseOp --- .../BufferizableOpInterfaceImpl.cpp | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index c9b8a7c84e..8234fe35e1 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -11,6 +11,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Transforms/DialectConversion.h" #include "Gradient/IR/GradientOps.h" @@ -46,6 +47,18 @@ getBufferizedFunctionArgType(FunctionOpInterface funcOp, int64_t index, layoutAttr.getValue(), rankedMemrefType.getMemorySpace()); } +static ReturnOp getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { + ReturnOp returnOp; + for (Block &b : funcOp.getFunctionBody()) { + if (auto candidateOp = dyn_cast(b.getTerminator())) { + if (returnOp) + return nullptr; + returnOp = candidateOp; + } + } + return returnOp; +} + Value generateAllocation(OpBuilder &builder, Location loc, Value reference) { auto origMemrefType = cast(reference.getType()); @@ -279,11 +292,10 @@ struct ForwardOpInterface return false; } - bufferization::AliasingValueList 
-    getAliasingValues(Operation *op, OpOperand &opOperand,
-                      const bufferization::AnalysisState &state) const
-    {
-        return {};
+    bufferization::AliasingOpOperandList
+    getAliasingOpOperands(Operation *op, Value value,
+                          const bufferization::AnalysisState &state) const {
+        return getAliasingBranchOpOperands(op, cast(value), state);
     }
 
     FailureOr
@@ -422,6 +442,15 @@ struct ReverseOpInterface
             getBufferType(op, value, options, invocationStack);
     }
 
+    LogicalResult verifyAnalysis(Operation *op,
+                                 const bufferization::AnalysisState &state) const {
+        auto funcOp = cast(op);
+        // TODO: func.func with multiple returns are not supported.
+        if (!getAssumedUniqueReturnOp(funcOp) && !funcOp.isExternal())
+            return op->emitOpError("op without unique func.return is not supported");
+        return success();
+    }
+
     LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                             const bufferization::BufferizationOptions &options) const
     {

From 42d224adf5a95c72e060e51029496795a2102de9 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 18 Sep 2024 11:55:49 -0400
Subject: [PATCH 093/183] Draft new ReverseOp Interface and fix its getBufferType

---
 .../BufferizableOpInterfaceImpl.cpp           | 102 +++++++-----------
 1 file changed, 40 insertions(+), 62 deletions(-)

diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
index 8234fe35e1..a0ef375cf7 100644
--- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -430,7 +430,7 @@ struct ReverseOpInterface
     FailureOr
     getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options,
                   SmallVector &invocationStack) const {
-        auto funcOp = cast<ForwardOp>(op);
+        auto funcOp = cast<ReverseOp>(op);
         auto bbArg = cast(value);
 
         // Function arguments are special.
@@ -454,18 +454,9 @@ struct ReverseOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const bufferization::BufferizationOptions &options) const { - return failure(); auto reverseOp = cast(op); - auto argc = reverseOp.getArgc(); - auto resc = reverseOp.getResc(); - SmallVector inputs; - SmallVector differentials; - SmallVector outputs; - SmallVector cotangents; - SmallVector tapeElements; - - // Update signature + // Update ReverseOp's signature auto argTys = reverseOp.getArgumentTypes(); auto retTys = reverseOp.getResultTypes(); SmallVector emptyRets; @@ -481,72 +472,59 @@ struct ReverseOpInterface MemRefType::get(tensorType.getShape(), tensorType.getElementType())); } auto reverseTy = rewriter.getFunctionType(bufferArgs, emptyRets); - - Block *block; rewriter.modifyOpInPlace(op, [&] { reverseOp.setFunctionType(reverseTy); - block = reverseOp.addEntryBlock(); }); + // Get ForwardOp's block. + auto &block = reverseOp.getBody().front(); PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(block); - auto params = reverseOp.getArguments(); - - for (size_t i = 0; i < argc * 2; i++) { - bool isDup = (i % 2) != 0; - Value val = params[i]; - isDup ? differentials.push_back(val) : inputs.push_back(val); - } - - auto upperLimit = (argc * 2) + (resc * 2); - for (size_t i = argc * 2; i < upperLimit; i++) { - bool isDup = (i % 2) != 0; - Value val = params[i]; - isDup ? cotangents.push_back(val) : outputs.push_back(val); - } + rewriter.setInsertionPointToStart(&block); - auto tapeCount = reverseOp.getTape(); - auto uppestLimit = upperLimit + tapeCount; - for (size_t i = upperLimit; i < uppestLimit; i++) { - tapeElements.push_back(params[i]); - } + auto argc = reverseOp.getArgc(); + auto resc = reverseOp.getResc(); + // Get callee's implementation. auto implAttr = reverseOp.getImplementationAttr(); auto impl = reverseOp.getImplementation(); - auto implOp = - SymbolTable::lookupNearestSymbolFrom(reverseOp, implAttr); + auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); auto implResTy = implOp.getResultTypes(); Location loc = reverseOp.getLoc(); - SmallVector tensorInputs; - for (auto tapeElement : tapeElements) { - Value tensorIn = (isa(tapeElement.getType())) - ? tapeElement - : rewriter.create(loc, tapeElement); - tensorInputs.push_back(tensorIn); - } - - for (auto cotangent : cotangents) { - Value tensorIn = (isa(cotangent.getType())) - ? cotangent - : rewriter.create(loc, cotangent); - tensorInputs.push_back(tensorIn); - } - - auto callOp = rewriter.create(loc, impl, implResTy, tensorInputs); - SmallVector tensorOutputs(callOp.getResults()); - - for (auto [differential, tensorOutput] : llvm::zip(differentials, tensorOutputs)) { - Value castVal = (isa(tensorOutput.getType())) - ? tensorOutput - : rewriter.create( - loc, differential.getType(), tensorOutput); - rewriter.create(loc, castVal, differential); + // Create to_tensor if callee is not yet bufferized. + SmallVector inputs(reverseOp.getArguments()); + SmallVector calleeInputs; + for (auto input : inputs) { + auto tensorIn = (isa(input.getType())) ? 
input + : rewriter.create(loc, input); + calleeInputs.push_back(tensorIn); } - auto T = rewriter.getIntegerAttr(rewriter.getI1Type(), 1); - rewriter.create(loc, ValueRange{}, T); + reverseOp.walk([&](func::CallOp callOp) { + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(callOp); + SmallVector inputs(callOp.getOperands()); + SmallVector calleeInputs; + for (auto input : inputs) { + auto tensorIn = (isa(input.getType())) ? input : + rewriter.create(loc, input); + calleeInputs.push_back(tensorIn); + } + rewriter.replaceOpWithNewOp(callOp, impl, implResTy, calleeInputs); + }); + reverseOp.walk([&](ReturnOp returnOp) { + PatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(returnOp); + SmallVector inputs(returnOp.getOperands()); + SmallVector returnInputs; + for (auto input : inputs) { + auto tensorIn = (isa(input.getType())) ? input : + rewriter.create(loc, input); + returnInputs.push_back(tensorIn); + } + rewriter.replaceOpWithNewOp(returnOp, returnInputs, returnOp.getEmpty()); + }); return success(); } }; From fedd6686d425387dbec2e31f18cd0cc55a9cac08 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 12:35:56 -0400 Subject: [PATCH 094/183] Make ForwardOp and ReverseOp follow FuncOp bufferization steps --- .../BufferizableOpInterfaceImpl.cpp | 240 ++++++++---------- 1 file changed, 108 insertions(+), 132 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index a0ef375cf7..2a7e8497d1 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -301,12 +301,12 @@ struct ForwardOpInterface FailureOr getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, SmallVector &invocationStack) const { - auto funcOp = cast(op); + auto forwardOp = cast(op); auto bbArg = cast(value); // Function arguments are special. - if (bbArg.getOwner() == &funcOp.getBody().front()) - return getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), + if (bbArg.getOwner() == &forwardOp.getBody().front()) + return getBufferizedFunctionArgType(forwardOp, bbArg.getArgNumber(), options); return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: @@ -315,9 +315,9 @@ struct ForwardOpInterface LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const { - auto funcOp = cast(op); + auto forwardOp = cast(op); // TODO: func.func with multiple returns are not supported. - if (!getAssumedUniqueReturnOp(funcOp) && !funcOp.isExternal()) + if (!getAssumedUniqueReturnOp(forwardOp)) return op->emitOpError("op without unique func.return is not supported"); return success(); } @@ -326,76 +326,64 @@ struct ForwardOpInterface const bufferization::BufferizationOptions &options) const { auto forwardOp = cast(op); - - // Update ForwardOp's signature - auto argTys = forwardOp.getArgumentTypes(); - auto retTys = forwardOp.getResultTypes(); - SmallVector emptyRets; - SmallVector args(argTys.begin(), argTys.end()); - args.insert(args.end(), retTys.begin(), retTys.end()); - SmallVector bufferArgs; - for (Type ty : args) { - auto tensorType = dyn_cast(ty); - if (!tensorType) - bufferArgs.push_back(ty); - else - bufferArgs.push_back( - MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + FunctionType funcType = forwardOp.getFunctionType(); + + // Construct the bufferized function type. 
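+        // Tensor arguments are mapped through the function-boundary type
+        // converter via getBufferizedFunctionArgType; non-tensor arguments
+        // are forwarded unchanged.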
+ SmallVector argTypes; + for (const auto &it : llvm::enumerate(funcType.getInputs())) { + Type argType = it.value(); + if (dyn_cast(argType)) { + argTypes.push_back( + getBufferizedFunctionArgType(forwardOp, it.index(), options)); + continue; + } + argTypes.push_back(argType); } - auto forwardTy = rewriter.getFunctionType(bufferArgs, emptyRets); - rewriter.modifyOpInPlace(op, [&] { - forwardOp.setFunctionType(forwardTy); - }); - // Get ForwardOp's block. - auto &block = forwardOp.getBody().front(); - PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(&block); - - auto argc = forwardOp.getArgc(); - auto resc = forwardOp.getResc(); - - // Get callee's implementation. - auto implAttr = forwardOp.getImplementationAttr(); - auto impl = forwardOp.getImplementation(); - auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); - auto implResTy = implOp.getResultTypes(); - Location loc = forwardOp.getLoc(); - - // Create to_tensor if callee is not yet bufferized. - SmallVector inputs(forwardOp.getArguments()); - SmallVector calleeInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? input - : rewriter.create(loc, input); - calleeInputs.push_back(tensorIn); - } + ReturnOp returnOp = getAssumedUniqueReturnOp(forwardOp); + assert(returnOp && "expected func with single return op"); + Location loc = returnOp.getLoc(); - forwardOp.walk([&](func::CallOp callOp) { - PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(callOp); - SmallVector inputs(callOp.getOperands()); - SmallVector calleeInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? input : - rewriter.create(loc, input); - calleeInputs.push_back(tensorIn); + // 1. Bufferize every block. + for (Block &block : forwardOp.getBody()) + if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, + options))) + return failure(); + + // 2. For each result, keep track of which inplace argument it reuses. + SmallVector returnValues; + for (OpOperand &returnOperand : returnOp->getOpOperands()) { + Value returnVal = returnOperand.get(); + auto tensorType = dyn_cast(returnVal.getType()); + rewriter.setInsertionPoint(returnOp); + + // If not a tensor type just forward it. + if (!tensorType) { + returnValues.push_back(returnVal); + continue; } - rewriter.replaceOpWithNewOp(callOp, impl, implResTy, calleeInputs); - }); + // Note: If `inferFunctionResultLayout = true`, cast are later folded + // away. + BaseMemRefType resultType = options.functionArgTypeConverterFn( + tensorType, *options.defaultMemorySpaceFn(tensorType), forwardOp, + options); + Value toMemrefOp = rewriter.create( + loc, resultType, returnVal); + returnValues.push_back(toMemrefOp); + } + + // 3. Rewrite the terminator. forwardOp.walk([&](ReturnOp returnOp) { PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(returnOp); - SmallVector inputs(returnOp.getOperands()); - SmallVector returnInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? input : - rewriter.create(loc, input); - returnInputs.push_back(tensorIn); - } - rewriter.replaceOpWithNewOp(returnOp, returnInputs, returnOp.getEmpty()); + rewriter.replaceOpWithNewOp(returnOp, returnValues, returnOp.getEmpty()); }); + + // 4. Rewrite the FuncOp type to buffer form. 
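+        // Argument types come from the boundary conversion above; result
+        // types are taken from the bufferized terminator operands.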
+ forwardOp.setType(FunctionType::get(op->getContext(), argTypes, + ValueRange(returnValues).getTypes())); + return success(); } }; @@ -430,12 +418,12 @@ struct ReverseOpInterface FailureOr getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, SmallVector &invocationStack) const { - auto funcOp = cast(op); + auto reverseOp = cast(op); auto bbArg = cast(value); // Function arguments are special. - if (bbArg.getOwner() == &funcOp.getBody().front()) - return getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), + if (bbArg.getOwner() == &reverseOp.getBody().front()) + return getBufferizedFunctionArgType(reverseOp, bbArg.getArgNumber(), options); return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: @@ -444,9 +432,9 @@ struct ReverseOpInterface LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const { - auto funcOp = cast(op); + auto reverseOp = cast(op); // TODO: func.func with multiple returns are not supported. - if (!getAssumedUniqueReturnOp(funcOp) && !funcOp.isExternal()) + if (!getAssumedUniqueReturnOp(reverseOp)) return op->emitOpError("op without unique func.return is not supported"); return success(); } @@ -455,76 +443,64 @@ struct ReverseOpInterface const bufferization::BufferizationOptions &options) const { auto reverseOp = cast(op); - - // Update ReverseOp's signature - auto argTys = reverseOp.getArgumentTypes(); - auto retTys = reverseOp.getResultTypes(); - SmallVector emptyRets; - SmallVector args(argTys.begin(), argTys.end()); - args.insert(args.end(), retTys.begin(), retTys.end()); - SmallVector bufferArgs; - for (Type ty : args) { - auto tensorType = dyn_cast(ty); - if (!tensorType) - bufferArgs.push_back(ty); - else - bufferArgs.push_back( - MemRefType::get(tensorType.getShape(), tensorType.getElementType())); + FunctionType funcType = reverseOp.getFunctionType(); + + // Construct the bufferized function type. + SmallVector argTypes; + for (const auto &it : llvm::enumerate(funcType.getInputs())) { + Type argType = it.value(); + if (dyn_cast(argType)) { + argTypes.push_back( + getBufferizedFunctionArgType(reverseOp, it.index(), options)); + continue; + } + argTypes.push_back(argType); } - auto reverseTy = rewriter.getFunctionType(bufferArgs, emptyRets); - rewriter.modifyOpInPlace(op, [&] { - reverseOp.setFunctionType(reverseTy); - }); - // Get ForwardOp's block. - auto &block = reverseOp.getBody().front(); - PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(&block); - - auto argc = reverseOp.getArgc(); - auto resc = reverseOp.getResc(); - - // Get callee's implementation. - auto implAttr = reverseOp.getImplementationAttr(); - auto impl = reverseOp.getImplementation(); - auto implOp = SymbolTable::lookupNearestSymbolFrom(op, implAttr); - auto implResTy = implOp.getResultTypes(); - Location loc = reverseOp.getLoc(); - - // Create to_tensor if callee is not yet bufferized. - SmallVector inputs(reverseOp.getArguments()); - SmallVector calleeInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? 
input - : rewriter.create(loc, input); - calleeInputs.push_back(tensorIn); - } + ReturnOp returnOp = getAssumedUniqueReturnOp(reverseOp); + assert(returnOp && "expected func with single return op"); + Location loc = returnOp.getLoc(); - reverseOp.walk([&](func::CallOp callOp) { - PatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(callOp); - SmallVector inputs(callOp.getOperands()); - SmallVector calleeInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? input : - rewriter.create(loc, input); - calleeInputs.push_back(tensorIn); + // 1. Bufferize every block. + for (Block &block : reverseOp.getBody()) + if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, + options))) + return failure(); + + // 2. For each result, keep track of which inplace argument it reuses. + SmallVector returnValues; + for (OpOperand &returnOperand : returnOp->getOpOperands()) { + Value returnVal = returnOperand.get(); + auto tensorType = dyn_cast(returnVal.getType()); + rewriter.setInsertionPoint(returnOp); + + // If not a tensor type just forward it. + if (!tensorType) { + returnValues.push_back(returnVal); + continue; } - rewriter.replaceOpWithNewOp(callOp, impl, implResTy, calleeInputs); - }); + // Note: If `inferFunctionResultLayout = true`, cast are later folded + // away. + BaseMemRefType resultType = options.functionArgTypeConverterFn( + tensorType, *options.defaultMemorySpaceFn(tensorType), reverseOp, + options); + Value toMemrefOp = rewriter.create( + loc, resultType, returnVal); + returnValues.push_back(toMemrefOp); + } + + // 3. Rewrite the terminator. reverseOp.walk([&](ReturnOp returnOp) { PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(returnOp); - SmallVector inputs(returnOp.getOperands()); - SmallVector returnInputs; - for (auto input : inputs) { - auto tensorIn = (isa(input.getType())) ? input : - rewriter.create(loc, input); - returnInputs.push_back(tensorIn); - } - rewriter.replaceOpWithNewOp(returnOp, returnInputs, returnOp.getEmpty()); + rewriter.replaceOpWithNewOp(returnOp, returnValues, returnOp.getEmpty()); }); + + // 4. Rewrite the FuncOp type to buffer form. 
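+        // Mirrors step 4 of ForwardOp: converted argument types plus the
+        // memref types of the rewritten return values.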
+ reverseOp.setType(FunctionType::get(op->getContext(), argTypes, + ValueRange(returnValues).getTypes())); + return success(); } }; From 0473d72fae5875589cbb521d434e5c0851ab5340 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 12:43:19 -0400 Subject: [PATCH 095/183] Make gradient-preprocess happen before eliminate-empty-tensors to pass ReturnOp check --- frontend/catalyst/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index f515699d5c..2041560145 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -206,9 +206,9 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt BUFFERIZATION_PASS = ( "BufferizationPass", [ + "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "gradient-preprocess", "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}", "canonicalize", "gradient-postprocess", From f5c4da0e9fe1af82fed401316e3f2dddc1f0ef37 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 12:48:49 -0400 Subject: [PATCH 096/183] Reformatting --- .../BufferizableOpInterfaceImpl.cpp | 126 +++++++++--------- 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 2a7e8497d1..19fbb27a1c 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -27,9 +27,9 @@ namespace { static BaseMemRefType getBufferizedFunctionArgType(FunctionOpInterface funcOp, int64_t index, - const bufferization::BufferizationOptions &options) { - auto tensorType = - dyn_cast(funcOp.getArgument(index).getType()); + const bufferization::BufferizationOptions &options) +{ + auto tensorType = dyn_cast(funcOp.getArgument(index).getType()); assert(tensorType && "expected TensorType"); BaseMemRefType memrefType = options.functionArgTypeConverterFn( @@ -42,21 +42,21 @@ getBufferizedFunctionArgType(FunctionOpInterface funcOp, int64_t index, auto rankedMemrefType = dyn_cast(memrefType); assert(rankedMemrefType && "buffer layout not supported on unranked tensors"); - return MemRefType::get( - rankedMemrefType.getShape(), rankedMemrefType.getElementType(), - layoutAttr.getValue(), rankedMemrefType.getMemorySpace()); + return MemRefType::get(rankedMemrefType.getShape(), rankedMemrefType.getElementType(), + layoutAttr.getValue(), rankedMemrefType.getMemorySpace()); } -static ReturnOp getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { - ReturnOp returnOp; - for (Block &b : funcOp.getFunctionBody()) { - if (auto candidateOp = dyn_cast(b.getTerminator())) { - if (returnOp) - return nullptr; - returnOp = candidateOp; +static ReturnOp getAssumedUniqueReturnOp(FunctionOpInterface funcOp) +{ + ReturnOp returnOp; + for (Block &b : funcOp.getFunctionBody()) { + if (auto candidateOp = dyn_cast(b.getTerminator())) { + if (returnOp) + return nullptr; + returnOp = candidateOp; + } } - } - return returnOp; + return returnOp; } Value generateAllocation(OpBuilder &builder, Location loc, Value reference) @@ -294,31 +294,32 @@ struct ForwardOpInterface bufferization::AliasingOpOperandList getAliasingOpOperands(Operation *op, Value value, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return getAliasingBranchOpOperands(op, cast(value), 
state); } - FailureOr - getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, - SmallVector &invocationStack) const { + FailureOr getBufferType(Operation *op, Value value, + const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const + { auto forwardOp = cast(op); auto bbArg = cast(value); // Function arguments are special. if (bbArg.getOwner() == &forwardOp.getBody().front()) - return getBufferizedFunctionArgType(forwardOp, bbArg.getArgNumber(), - options); + return getBufferizedFunctionArgType(forwardOp, bbArg.getArgNumber(), options); - return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: - getBufferType(op, value, options, invocationStack); + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( + op, value, options, invocationStack); } - LogicalResult verifyAnalysis(Operation *op, - const bufferization::AnalysisState &state) const { - auto forwardOp = cast(op); + LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const + { + auto forwardOp = cast(op); // TODO: func.func with multiple returns are not supported. if (!getAssumedUniqueReturnOp(forwardOp)) - return op->emitOpError("op without unique func.return is not supported"); + return op->emitOpError("op without unique func.return is not supported"); return success(); } @@ -333,8 +334,7 @@ struct ForwardOpInterface for (const auto &it : llvm::enumerate(funcType.getInputs())) { Type argType = it.value(); if (dyn_cast(argType)) { - argTypes.push_back( - getBufferizedFunctionArgType(forwardOp, it.index(), options)); + argTypes.push_back(getBufferizedFunctionArgType(forwardOp, it.index(), options)); continue; } argTypes.push_back(argType); @@ -346,9 +346,8 @@ struct ForwardOpInterface // 1. Bufferize every block. for (Block &block : forwardOp.getBody()) - if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, - options))) - return failure(); + if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, options))) + return failure(); // 2. For each result, keep track of which inplace argument it reuses. SmallVector returnValues; @@ -359,17 +358,16 @@ struct ForwardOpInterface // If not a tensor type just forward it. if (!tensorType) { - returnValues.push_back(returnVal); - continue; + returnValues.push_back(returnVal); + continue; } // Note: If `inferFunctionResultLayout = true`, cast are later folded // away. BaseMemRefType resultType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpaceFn(tensorType), forwardOp, - options); - Value toMemrefOp = rewriter.create( - loc, resultType, returnVal); + tensorType, *options.defaultMemorySpaceFn(tensorType), forwardOp, options); + Value toMemrefOp = + rewriter.create(loc, resultType, returnVal); returnValues.push_back(toMemrefOp); } @@ -381,8 +379,8 @@ struct ForwardOpInterface }); // 4. Rewrite the FuncOp type to buffer form. 
- forwardOp.setType(FunctionType::get(op->getContext(), argTypes, - ValueRange(returnValues).getTypes())); + forwardOp.setType( + FunctionType::get(op->getContext(), argTypes, ValueRange(returnValues).getTypes())); return success(); } @@ -411,31 +409,32 @@ struct ReverseOpInterface bufferization::AliasingOpOperandList getAliasingOpOperands(Operation *op, Value value, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return getAliasingBranchOpOperands(op, cast(value), state); } - FailureOr - getBufferType(Operation *op, Value value, const bufferization::BufferizationOptions &options, - SmallVector &invocationStack) const { + FailureOr getBufferType(Operation *op, Value value, + const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const + { auto reverseOp = cast(op); auto bbArg = cast(value); // Function arguments are special. if (bbArg.getOwner() == &reverseOp.getBody().front()) - return getBufferizedFunctionArgType(reverseOp, bbArg.getArgNumber(), - options); + return getBufferizedFunctionArgType(reverseOp, bbArg.getArgNumber(), options); - return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: - getBufferType(op, value, options, invocationStack); + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( + op, value, options, invocationStack); } - LogicalResult verifyAnalysis(Operation *op, - const bufferization::AnalysisState &state) const { - auto reverseOp = cast(op); + LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const + { + auto reverseOp = cast(op); // TODO: func.func with multiple returns are not supported. if (!getAssumedUniqueReturnOp(reverseOp)) - return op->emitOpError("op without unique func.return is not supported"); + return op->emitOpError("op without unique func.return is not supported"); return success(); } @@ -450,8 +449,7 @@ struct ReverseOpInterface for (const auto &it : llvm::enumerate(funcType.getInputs())) { Type argType = it.value(); if (dyn_cast(argType)) { - argTypes.push_back( - getBufferizedFunctionArgType(reverseOp, it.index(), options)); + argTypes.push_back(getBufferizedFunctionArgType(reverseOp, it.index(), options)); continue; } argTypes.push_back(argType); @@ -463,9 +461,8 @@ struct ReverseOpInterface // 1. Bufferize every block. for (Block &block : reverseOp.getBody()) - if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, - options))) - return failure(); + if (failed(bufferization::bufferizeBlockSignature(&block, rewriter, options))) + return failure(); // 2. For each result, keep track of which inplace argument it reuses. SmallVector returnValues; @@ -476,17 +473,16 @@ struct ReverseOpInterface // If not a tensor type just forward it. if (!tensorType) { - returnValues.push_back(returnVal); - continue; + returnValues.push_back(returnVal); + continue; } // Note: If `inferFunctionResultLayout = true`, cast are later folded // away. BaseMemRefType resultType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpaceFn(tensorType), reverseOp, - options); - Value toMemrefOp = rewriter.create( - loc, resultType, returnVal); + tensorType, *options.defaultMemorySpaceFn(tensorType), reverseOp, options); + Value toMemrefOp = + rewriter.create(loc, resultType, returnVal); returnValues.push_back(toMemrefOp); } @@ -498,8 +494,8 @@ struct ReverseOpInterface }); // 4. Rewrite the FuncOp type to buffer form. 
-        reverseOp.setType(FunctionType::get(op->getContext(), argTypes,
-                                            ValueRange(returnValues).getTypes()));
+        reverseOp.setType(
+            FunctionType::get(op->getContext(), argTypes, ValueRange(returnValues).getTypes()));
 
         return success();
     }
 };

From d42b123a585e28d6e2bd34f1b3ba132ee7b3f714 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 18 Sep 2024 13:56:17 -0400
Subject: [PATCH 097/183] Update llvm patch with FunctionArgTypeConverterFn

---
 mlir/patches/moduleOp-bufferization.patch | 51 +++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/moduleOp-bufferization.patch
index 683f1d51a3..f66ca1b529 100644
--- a/mlir/patches/moduleOp-bufferization.patch
+++ b/mlir/patches/moduleOp-bufferization.patch
@@ -1,3 +1,24 @@
+diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+index 2fda091e412a..ba28596d1f97 100644
+--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
++++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+@@ -9,6 +9,7 @@
+ #ifndef MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZABLEOPINTERFACE_H_
+ #define MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZABLEOPINTERFACE_H_
+ 
++#include "mlir/Interfaces/FunctionInterfaces.h"
+ #include "mlir/IR/Operation.h"
+ #include "mlir/IR/PatternMatch.h"
+ #include "mlir/Support/LLVM.h"
+@@ -262,7 +263,7 @@ struct BufferizationOptions {
+   /// Parameters: Value, memory space, func op, bufferization options
+   using FunctionArgTypeConverterFn =
+       std::function<BaseMemRefType(TensorType, Attribute memorySpace,
+-                                   func::FuncOp, const BufferizationOptions &)>;
++                                   FunctionOpInterface, const BufferizationOptions &)>;
+   /// Tensor -> MemRef type converter.
+   /// Parameters: Value, memory space, bufferization options
+   using UnknownTypeConverterFn = std::function<BaseMemRefType(
+       Value, Attribute memorySpace, const BufferizationOptions &)>;

From: Tzung-Han Juang
Date: Wed, 18 Sep 2024 15:42:42 -0400
Subject: [PATCH 098/183] Preserve unused return types during bufferization

---
 .../BufferizableOpInterfaceImpl.cpp | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
index 19fbb27a1c..4eb9e3bd52 100644
--- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -378,9 +378,16 @@ struct ForwardOpInterface
             rewriter.replaceOpWithNewOp(returnOp, returnValues, returnOp.getEmpty());
         });
 
-        // 4. Rewrite the FuncOp type to buffer form.
+        // 4. Rewrite the FuncOp type to buffer form. Also preserve unused return types.
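+        // The result types are now rebuilt from the declared result types
+        // rather than from the terminator operands, so results whose uses
+        // were folded away keep their slot in the bufferized signature.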
+ SmallVector returnTypes; + for (auto retTy : reverseOp.getResultTypes()) { + auto tensorType = dyn_cast(retTy); + BaseMemRefType resultType = options.functionArgTypeConverterFn( + tensorType, *options.defaultMemorySpaceFn(tensorType), reverseOp, options); + returnTypes.push_back(resultType); + } reverseOp.setType( - FunctionType::get(op->getContext(), argTypes, ValueRange(returnValues).getTypes())); + FunctionType::get(op->getContext(), argTypes, returnTypes)); return success(); } From de97c6bec5c5deeb606e0de8ba565d05d55cf1c9 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 16:09:39 -0400 Subject: [PATCH 099/183] Use identity-layout-map to prevent unused returns from being removed --- frontend/catalyst/compiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 2041560145..c20de525ea 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -209,7 +209,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops" + " function-boundary-type-conversion=identity-layout-map}", "canonicalize", "gradient-postprocess", "convert-arraylist-to-memref", From 42b3870aca4b755e086532bdd80ac5f71599d244 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 17:15:21 -0400 Subject: [PATCH 100/183] revert identiy-layout-map --- frontend/catalyst/compiler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index c20de525ea..2041560145 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -209,8 +209,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops" - " function-boundary-type-conversion=identity-layout-map}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}", "canonicalize", "gradient-postprocess", "convert-arraylist-to-memref", From ee02d7e0ba52b72a7341c78528a8f98b324a8859 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 18 Sep 2024 21:48:54 -0400 Subject: [PATCH 101/183] Restore ReserseOp's signature if it is optimized away by one-shot-bufferize --- .../Transforms/PostprocessingPatterns.cpp | 77 ++++++++++++++++++- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index 5c79663f0d..731ad4d5a9 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -14,12 +14,14 @@ #include "iostream" #include "llvm/Support/raw_ostream.h" +#include #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" @@ -41,12 +43,14 @@ struct PostprocessForwardOp : public OpRewritePattern { // 
Check if the numbers of args and returns match Enzyme's format.
         auto argc = op.getArgc();
         auto resc = op.getResc();
-        auto tapeCount = op.getTape();
+        auto tape = op.getTape();
 
-        if (op.getFunctionType().getNumInputs() == (argc + resc) * 2 &&
-            op.getFunctionType().getNumResults() == tapeCount)
+        // If function signature is modified, this pass cannot be processed.
+        if (op.getFunctionType().getNumInputs() != argc ||
+            op.getFunctionType().getNumResults() != (resc + tape))
             return failure();
 
+
         auto argTys = op.getArgumentTypes();
         auto retTys = op.getResultTypes();
         SmallVector bufferArgs;
@@ -127,7 +131,9 @@ struct PostprocessReverseOp : public OpRewritePattern {
         auto forwardResc = op.getResc();
         auto tape = op.getTape();
 
-        if (op.getFunctionType().getNumInputs() == (forwardArgc + forwardResc) * 2 + tape)
+        // If function signature is modified, this pass cannot be processed.
+        if (op.getFunctionType().getNumInputs() != (forwardResc + tape) ||
+            op.getFunctionType().getNumResults() != forwardArgc)
             return failure();
 
         auto argTys = op.getArgumentTypes();
@@ -200,6 +206,68 @@ struct PostprocessReverseOp : public OpRewritePattern {
     }
 };
 
+struct RestoreReverseOp : public OpRewritePattern {
+    using OpRewritePattern::OpRewritePattern;
+
+    mlir::LogicalResult matchAndRewrite(ReverseOp op,
+                                        mlir::PatternRewriter &rewriter) const override
+    {
+        // ReverseOp's output is optimized away by one-shot bufferize.
+        auto forwardArgc = op.getArgc();
+        auto forwardResc = op.getResc();
+        auto tape = op.getTape();
+
+        // Check if the Op is post-processed.
+        if (op.getFunctionType().getNumInputs() == (forwardResc + forwardArgc) * 2 + tape)
+            return failure();
+
+        // If function signature is modified, this pass cannot be processed.
+        if (op.getFunctionType().getNumResults() >= forwardArgc)
+            return failure();
+
+        // Get the parent module.
+        auto module = op->getParentOfType();
+
+        // Get GradOp
+        CustomGradOp gradCaller = nullptr;
+        for (auto gradOp : module.getOps()) {
+            if (gradOp.getReverse() == op.getSymName()) {
+                gradCaller = gradOp;
+            }
+        }
+
+        if (!gradCaller)
+            return failure();
+
+        ForwardOp target = nullptr;
+        // Get the corresponding ForwardOp.
+        for (auto forwardOp : module.getOps()) {
+            if (forwardOp.getSymName() == gradCaller.getForward()) {
+                target = forwardOp;
+            }
+        }
+
+        if (!target)
+            return failure();
+
+        auto forwardArgTys = target.getArgumentTypes();
+        SmallVector noTapeTys;
+        for (size_t i = 0 ; i < forwardArgTys.size(); ++i) {
+            if (i < op.getArgc()) {
+                noTapeTys.push_back(forwardArgTys[i]);
+            }
+        }
+
+        auto reverseTy = rewriter.getFunctionType(op.getArgumentTypes(), noTapeTys);
+
+        rewriter.modifyOpInPlace(op, [&] {
+            op.setFunctionType(reverseTy);
+        });
+
+        return failure();
+    }
+};
+
 } // namespace
 
 namespace catalyst {
 namespace gradient {
@@ -207,6 +275,7 @@ void populatePostprocessingPatterns(RewritePatternSet &patterns)
 {
+    patterns.add(patterns.getContext());
     patterns.add(patterns.getContext());
     patterns.add(patterns.getContext());
 }

From df63f1a46854653759bbd98e7d6793293e29a4d4 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Wed, 18 Sep 2024 22:24:35 -0400
Subject: [PATCH 102/183] Use replaceOpWithNewBufferizedOp for quantum.unitary

---
 .../lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
index 08689c18a9..7f4bb2bfe9 100644
--- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
+++
b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -44,14 +44,11 @@ struct QubitUnitaryOpInterface
         auto toMemrefOp = rewriter.create(loc, memrefType,
                                           qubitUnitaryOp.getMatrix());
         auto memref = toMemrefOp.getResult();
-        auto newQubitUnitaryOp = rewriter.create(
-            loc, qubitUnitaryOp.getOutQubits().getTypes(),
+        bufferization::replaceOpWithNewBufferizedOp(rewriter, op,
+            qubitUnitaryOp.getOutQubits().getTypes(),
             qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, qubitUnitaryOp.getInQubits(),
             qubitUnitaryOp.getAdjointAttr(), qubitUnitaryOp.getInCtrlQubits(),
             qubitUnitaryOp.getInCtrlValues());
-        bufferization::replaceOpWithBufferizedValues(rewriter, op,
-                                                     newQubitUnitaryOp.getOutQubits());
-
         return success();
     }
 };

From 8a1c49ae25ebd1a42da17f59a7859c685789334f Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 19 Sep 2024 10:29:48 -0400
Subject: [PATCH 103/183] Use old deallocation steps

---
 frontend/catalyst/compiler.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index 2041560145..f51f9241b6 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -212,16 +212,13 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}",
         "canonicalize",
         "gradient-postprocess",
+        "func.func(buffer-hoisting)",
+        "func.func(buffer-loop-hoisting)",
+        "func.func(buffer-deallocation)",
         "convert-arraylist-to-memref",
         "convert-bufferization-to-memref",
         "canonicalize",
         "cp-global-memref",
-        # "func.func(buffer-hoisting)",
-        # "func.func(buffer-loop-hoisting)",
-        # "buffer-results-to-out-params",
-        # "drop-equivalent-buffer-results",
-        # "func.func(promote-buffers-to-stack)",
-        # "buffer-deallocation-pipeline",
     ],
 )

From 160a0ed6fa59cbe46d785122bc90ba4bc993ad0b Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 19 Sep 2024 10:50:35 -0400
Subject: [PATCH 104/183] Cleanup

---
 frontend/catalyst/compiler.py                  |  7 +++++--
 .../Transforms/BufferizableOpInterfaceImpl.cpp | 10 ++++------
 .../Transforms/PostprocessingPatterns.cpp      | 15 ++++++---------
 .../Transforms/BufferizableOpInterfaceImpl.cpp |  4 ++--
 4 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py
index f51f9241b6..66687e9693 100644
--- a/frontend/catalyst/compiler.py
+++ b/frontend/catalyst/compiler.py
@@ -210,14 +210,17 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt
         "eliminate-empty-tensors",
         "convert-elementwise-to-linalg",
         "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}",
-        "canonicalize",
+        "canonicalize",  # Remove dead memrefToTensorOp's
         "gradient-postprocess",
+        # introduced during gradient-bufferize of callbacks
         "func.func(buffer-hoisting)",
         "func.func(buffer-loop-hoisting)",
         "func.func(buffer-deallocation)",
         "convert-arraylist-to-memref",
         "convert-bufferization-to-memref",
-        "canonicalize",
+        "canonicalize",  # Must be after convert-bufferization-to-memref
+        # otherwise there are issues in lowering of dynamic tensors.
+ # "cse", "cp-global-memref", ], ) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 4eb9e3bd52..3b620d75a4 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -378,7 +378,7 @@ struct ForwardOpInterface rewriter.replaceOpWithNewOp(returnOp, returnValues, returnOp.getEmpty()); }); - // 4. Rewrite the FuncOp type to buffer form. Also preserve unused return types. + // 4. Rewrite the FuncOp type to buffer form. Also preserve unused return types. SmallVector returnTypes; for (auto retTy : forwardOp.getResultTypes()) { auto tensorType = dyn_cast(retTy); @@ -386,8 +386,7 @@ struct ForwardOpInterface tensorType, *options.defaultMemorySpaceFn(tensorType), forwardOp, options); returnTypes.push_back(resultType); } - forwardOp.setType( - FunctionType::get(op->getContext(), argTypes, returnTypes)); + forwardOp.setType(FunctionType::get(op->getContext(), argTypes, returnTypes)); return success(); } @@ -500,7 +499,7 @@ struct ReverseOpInterface rewriter.replaceOpWithNewOp(returnOp, returnValues, returnOp.getEmpty()); }); - // 4. Rewrite the FuncOp type to buffer form. Also preserve unused return types. + // 4. Rewrite the FuncOp type to buffer form. Also preserve unused return types. SmallVector returnTypes; for (auto retTy : reverseOp.getResultTypes()) { auto tensorType = dyn_cast(retTy); @@ -508,8 +507,7 @@ struct ReverseOpInterface tensorType, *options.defaultMemorySpaceFn(tensorType), reverseOp, options); returnTypes.push_back(resultType); } - reverseOp.setType( - FunctionType::get(op->getContext(), argTypes, returnTypes)); + reverseOp.setType(FunctionType::get(op->getContext(), argTypes, returnTypes)); return success(); } diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index 731ad4d5a9..0f1b8332ba 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -46,11 +46,10 @@ struct PostprocessForwardOp : public OpRewritePattern { auto tape = op.getTape(); // If function signature is modified, this pass cannot be processed. - if (op.getFunctionType().getNumInputs() != argc || - op.getFunctionType().getNumResults() != (resc + tape)) + if (op.getFunctionType().getNumInputs() != argc || + op.getFunctionType().getNumResults() != (resc + tape)) return failure(); - auto argTys = op.getArgumentTypes(); auto retTys = op.getResultTypes(); SmallVector bufferArgs; @@ -132,8 +131,8 @@ struct PostprocessReverseOp : public OpRewritePattern { auto tape = op.getTape(); // If function signature is modified, this pass cannot be processed. 
- if (op.getFunctionType().getNumInputs() != (forwardResc + tape) || - op.getFunctionType().getNumResults() != forwardArgc) + if (op.getFunctionType().getNumInputs() != (forwardResc + tape) || + op.getFunctionType().getNumResults() != forwardArgc) return failure(); auto argTys = op.getArgumentTypes(); @@ -252,7 +251,7 @@ struct RestoreReverseOp : public OpRewritePattern { auto forwardArgTys = target.getArgumentTypes(); SmallVector noTapeTys; - for (size_t i = 0 ; i < forwardArgTys.size(); ++i) { + for (size_t i = 0; i < forwardArgTys.size(); ++i) { if (i < op.getArgc()) { noTapeTys.push_back(forwardArgTys[i]); } @@ -260,9 +259,7 @@ struct RestoreReverseOp : public OpRewritePattern { auto reverseTy = rewriter.getFunctionType(op.getArgumentTypes(), noTapeTys); - rewriter.modifyOpInPlace(op, [&] { - op.setFunctionType(reverseTy); - }); + rewriter.modifyOpInPlace(op, [&] { op.setFunctionType(reverseTy); }); return failure(); } diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 7f4bb2bfe9..d176098b8f 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -44,8 +44,8 @@ struct QubitUnitaryOpInterface auto toMemrefOp = rewriter.create(loc, memrefType, qubitUnitaryOp.getMatrix()); auto memref = toMemrefOp.getResult(); - bufferization::replaceOpWithNewBufferizedOp(rewriter, op, - qubitUnitaryOp.getOutQubits().getTypes(), + bufferization::replaceOpWithNewBufferizedOp( + rewriter, op, qubitUnitaryOp.getOutQubits().getTypes(), qubitUnitaryOp.getOutCtrlQubits().getTypes(), memref, qubitUnitaryOp.getInQubits(), qubitUnitaryOp.getAdjointAttr(), qubitUnitaryOp.getInCtrlQubits(), qubitUnitaryOp.getInCtrlValues()); From 3f6eaf2d3de662fccf96e98cdc790263f53bca66 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 19 Sep 2024 10:57:28 -0400 Subject: [PATCH 105/183] Fix var names in mlir debug test --- frontend/test/pytest/test_debug.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/test/pytest/test_debug.py b/frontend/test/pytest/test_debug.py index 15685e828d..16c7b9ad8d 100644 --- a/frontend/test/pytest/test_debug.py +++ b/frontend/test/pytest/test_debug.py @@ -466,18 +466,18 @@ def f(x: float): ), ( "MLIRToLLVMDialect", - "%5 = llvm.fmul %4, %4 : f64\n", - "%t = llvm.fmul %4, %4 : f64\n" + " %5 = llvm.fmul %t, %4 : f64\n", + "%6 = llvm.fmul %5, %5 : f64\n", + "%t = llvm.fmul %5, %5 : f64\n" + " %6 = llvm.fmul %t, %5 : f64\n", ), ( "llvm_ir", - "%5 = fmul double %4, %4\n", - "%t = fmul double %4, %4\n" + "%5 = fmul double %t, %4\n", + "%6 = fmul double %5, %5\n", + "%t = fmul double %5, %5\n" + "%6 = fmul double %t, %5\n", ), ( "last", - "%5 = fmul double %4, %4\n", - "%t = fmul double %4, %4\n" + "%5 = fmul double %t, %4\n", + "%6 = fmul double %5, %5\n", + "%t = fmul double %5, %5\n" + "%6 = fmul double %t, %5\n", ), ], ) From f42b8611ea5f9f3e6f0f1b4226428d505b18a742 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 19 Sep 2024 12:20:47 -0400 Subject: [PATCH 106/183] Test removing stride from memreftype in customCall --- .../Transforms/BufferizableOpInterfaceImpl.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 15f287f1d5..33bda43f2d 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ 
b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -3,6 +3,8 @@
 #include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 #include "Catalyst/IR/CatalystOps.h"
@@ -85,7 +87,16 @@ struct CustomCallOpInterface
             FailureOr opBuffer = getBuffer(rewriter, operand, options);
             if (failed(opBuffer))
                 return failure();
-            bufferArgs.push_back(*opBuffer);
+            MemRefType memrefType = dyn_cast(opBuffer->getType());
+            if (!memrefType)
+                return failure();
+            if (!memrefType.getLayout().isIdentity()) {
+                auto nonStrideType = MemRefType::get(memrefType.getShape(), memrefType.getElementType());
+                auto newMemRef = rewriter.create(op->getLoc(), nonStrideType, *opBuffer);
+                bufferArgs.push_back(newMemRef);
+            } else {
+                bufferArgs.push_back(*opBuffer);
+            }
         }
 
         // Add bufferized return values to the arguments

From 3c4f0ca8889497f7dcdd9bbbf8bb41b7dece910b Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Thu, 19 Sep 2024 20:08:31 -0400
Subject: [PATCH 107/183] Use SubView for allocCopyMemrefDyn if CastOp is not
 supported

---
 mlir/lib/Quantum/Transforms/cp_global_buffers.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
index 227d7977b0..b772d378cd 100644
--- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
+++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
@@ -106,6 +106,12 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter)
     Value newMemRef = rewriter.create(loc, memrefType, dynDims);
 
     // Cast memrefType back to maintain memory layout.
+    if (!memref::CastOp::areCastCompatible(memrefType, origMemrefType)) {
+        auto subview = rewriter.create(loc, origMemrefType, memref);
+        rewriter.create(loc, memref, subview);
+        return subview;
+    }
+
     Value castMemRef = rewriter.create(loc, origMemrefType, newMemRef);
     rewriter.create(loc, memref, newMemRef);
     return castMemRef;

From 0eb6cbf34d192f4c991a12ea8cbc402b4021c100 Mon Sep 17 00:00:00 2001
From: Tzung-Han Juang
Date: Fri, 20 Sep 2024 11:12:17 -0400
Subject: [PATCH 108/183] Use subview to handle memref with strides and offset

---
 .../Quantum/Transforms/cp_global_buffers.cpp  | 29 ++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
index b772d378cd..4f0d90a8a1 100644
--- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
+++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp
@@ -103,12 +103,33 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter)
             ndim++;
         }
     }
-
+    
     Value newMemRef = rewriter.create(loc, memrefType, dynDims);
 
     // Cast memrefType back to maintain memory layout.
- if (!memref::CastOp::areCastCompatible(memrefType, origMemrefType)) { - auto subview = rewriter.create(loc, origMemrefType, memref); - rewriter.create(loc, memref, subview); + //if (!memref::CastOp::areCastCompatible(memrefType, origMemrefType)) { + if (!origMemrefType.getLayout().isIdentity()) { + SmallVector sizes = + memref::getMixedSizes(rewriter, loc, newMemRef); + + // getlayout + auto layout = origMemrefType.getLayout(); + auto stridedAttr = cast(layout); + auto offset = stridedAttr.getOffset(); + auto strideArr = stridedAttr.getStrides(); + + // Rebuild strides and offsets info + SmallVector strides; + for (auto stride : strideArr) { + strides.push_back(rewriter.getIndexAttr(stride)); + } + SmallVector offsets(origMemrefType.getRank(), + rewriter.getIndexAttr(0)); + offsets[0] = rewriter.getIndexAttr(offset); + auto subview = rewriter.create( + loc, origMemrefType, newMemRef, + offsets, sizes, strides); + subview.print(llvm::outs()); + rewriter.create(loc, memref, newMemRef); return subview; } From d7a0019af47cd0ec3cedb6e294a19530b445aa37 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 20 Sep 2024 13:00:02 -0400 Subject: [PATCH 109/183] Use identity map and cleanup --- frontend/catalyst/compiler.py | 2 +- mlir/lib/Quantum/Transforms/cp_global_buffers.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 66687e9693..32080e23ed 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -209,7 +209,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops function-boundary-type-conversion=identity-layout-map}", "canonicalize", # Remove dead memrefToTensorOp's "gradient-postprocess", # introduced during gradient-bufferize of callbacks diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp index 4f0d90a8a1..e622604e07 100644 --- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp +++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp @@ -89,13 +89,13 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) { auto origMemrefType = cast(memref.getType()); // Rebuild MemRefType without memory layout. - auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); + auto newMemrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); llvm::SmallVector dynDims; { llvm::SmallVector dynIndices; int64_t ndim = 0; - for (auto dim : memrefType.getShape()) { + for (auto dim : newMemrefType.getShape()) { if (dim < 0) { Value dynValue = rewriter.create(loc, memref, ndim); dynDims.push_back(dynValue); @@ -104,10 +104,10 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) } } - Value newMemRef = rewriter.create(loc, memrefType, dynDims); + Value newMemRef = rewriter.create(loc, newMemrefType, dynDims); // Cast memrefType back to maintain memory layout. 
- //if (!memref::CastOp::areCastCompatible(memrefType, origMemrefType)) { - if (!origMemrefType.getLayout().isIdentity()) { + if (!memref::CastOp::areCastCompatible(newMemrefType, origMemrefType)) { + //if (!origMemrefType.getLayout().isIdentity()) { SmallVector sizes = memref::getMixedSizes(rewriter, loc, newMemRef); @@ -128,7 +128,7 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) auto subview = rewriter.create( loc, origMemrefType, newMemRef, offsets, sizes, strides); - subview.print(llvm::outs()); + rewriter.create(loc, memref, newMemRef); return subview; } From 0a915b510709afb2bf84f32deb43074159c250df Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 20 Sep 2024 14:33:27 -0400 Subject: [PATCH 110/183] Add CallOp bufferization patch --- mlir/Makefile | 4 ++ mlir/patches/callOp-bufferization.patch | 71 +++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 mlir/patches/callOp-bufferization.patch diff --git a/mlir/Makefile b/mlir/Makefile index debedd0c3a..d966b0a96f 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -14,6 +14,7 @@ ENABLE_ASAN?=OFF BUILD_TYPE?=Release LLVM_ROOT=$(MK_DIR)/llvm-project LLVM_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch +LLVM_FUNC_CALL_PATCH_FILE=$(MK_DIR)/patches/callOp-bufferization.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -59,6 +60,9 @@ llvm: @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); \ fi + @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); \ + fi cmake -G Ninja -S llvm-project/llvm -B $(LLVM_BUILD_DIR) \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ -DLLVM_BUILD_EXAMPLES=OFF \ diff --git a/mlir/patches/callOp-bufferization.patch b/mlir/patches/callOp-bufferization.patch new file mode 100644 index 0000000000..2b7180fdd7 --- /dev/null +++ b/mlir/patches/callOp-bufferization.patch @@ -0,0 +1,71 @@ +diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +index 053ea7935260a2..9fbe574ec392dc 100644 +--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp ++++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +@@ -258,20 +258,23 @@ struct CallOpInterface + return failure(); + Value buffer = *maybeBuffer; + +- // Caller / callee type mismatch is handled with a CastOp. ++ // Caller / callee type mismatch is handled with castOrReallocMemRefValue. + auto memRefType = funcType.getInput(opOperand.getOperandNumber()); + // Since we don't yet have a clear layout story, to_memref may + // conservatively turn tensors into more dynamic memref than necessary. + // If the memref type of the callee fails, introduce an extra memref.cast + // that will either canonicalize away or fail compilation until we can do +- // something better. ++ // something better. Insert a reallocation + copy if it cannot be ++ // statically guaranteed that a direct cast would be valid. 
+ if (buffer.getType() != memRefType) { +- assert( +- memref::CastOp::areCastCompatible(buffer.getType(), memRefType) && +- "CallOp::bufferize: cast incompatible"); +- Value castBuffer = rewriter.create(callOp.getLoc(), +- memRefType, buffer); +- buffer = castBuffer; ++ auto memrefDstType = dyn_cast(memRefType); ++ assert(memrefDstType && ++ "buffer layout not supported on unranked tensors"); ++ FailureOr replacement = bufferization::castOrReallocMemRefValue( ++ rewriter, buffer, memrefDstType, options); ++ if (failed(replacement)) ++ return failure(); ++ buffer = *replacement; + } + newOperands.push_back(buffer); + } +diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +index 0248afb11f1672..0d5224514e3a02 100644 +--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir ++++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +@@ -71,6 +71,30 @@ func.func @return_extract_slice(%idx: index, %sz: index) -> (tensor<2x?xf32>) + + // ----- + ++// CHECK-NO-LAYOUT-MAP-LABEL: func.func @foo( ++// CHECK-NO-LAYOUT-MAP-SAME: %[[VAL_0:.*]]: memref<3x8xf16>) -> memref<3x8xf16> { ++// CHECK-NO-LAYOUT-MAP: return %[[VAL_0]] : memref<3x8xf16> ++// CHECK-NO-LAYOUT-MAP: } ++func.func @foo(%arg0: tensor<3x8xf16>) -> tensor<3x8xf16> { ++ return %arg0 : tensor<3x8xf16> ++} ++ ++// CHECK-NO-LAYOUT-MAP-LABEL: func.func @call_extract_slice( ++// CHECK-NO-LAYOUT-MAP-SAME: %[[VAL_0:.*]]: memref<4x8xf16>) -> memref<3x8xf16> { ++// CHECK-NO-LAYOUT-MAP: %[[VAL_1:.*]] = memref.subview %[[VAL_0]][1, 0] [3, 8] [1, 1] : memref<4x8xf16> to memref<3x8xf16, strided<[8, 1], offset: 8>> ++// CHECK-NO-LAYOUT-MAP: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<3x8xf16> ++// CHECK-NO-LAYOUT-MAP: memref.copy %[[VAL_1]], %[[VAL_2]] : memref<3x8xf16, strided<[8, 1], offset: 8>> to memref<3x8xf16> ++// CHECK-NO-LAYOUT-MAP: %[[VAL_3:.*]] = call @foo(%[[VAL_2]]) : (memref<3x8xf16>) -> memref<3x8xf16> ++// CHECK-NO-LAYOUT-MAP: return %[[VAL_3]] : memref<3x8xf16> ++// CHECK-NO-LAYOUT-MAP: } ++func.func @call_extract_slice(%arg0: tensor<4x8xf16>) -> (tensor<3x8xf16>) { ++ %0 = tensor.extract_slice %arg0[1, 0] [3, 8] [1, 1] : tensor<4x8xf16> to tensor<3x8xf16> ++ %1 = call @foo(%0) : (tensor<3x8xf16>) -> tensor<3x8xf16> ++ return %1 : tensor<3x8xf16> ++} ++ ++// ----- ++ + // CHECK-LABEL: func private @private_func + // CHECK-NO-LAYOUT-MAP-LABEL: func private @private_func(memref) -> f32 + func.func private @private_func(tensor) -> (f32) From 52fd238b65e7bb5f5a365dec45bf2d3c2d87f000 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 20 Sep 2024 14:39:09 -0400 Subject: [PATCH 111/183] Revert changes in frontend/test/pytest/test_debug.py --- frontend/test/pytest/test_debug.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/test/pytest/test_debug.py b/frontend/test/pytest/test_debug.py index 16c7b9ad8d..15685e828d 100644 --- a/frontend/test/pytest/test_debug.py +++ b/frontend/test/pytest/test_debug.py @@ -466,18 +466,18 @@ def f(x: float): ), ( "MLIRToLLVMDialect", - "%6 = llvm.fmul %5, %5 : f64\n", - "%t = llvm.fmul %5, %5 : f64\n" + " %6 = llvm.fmul %t, %5 : f64\n", + "%5 = llvm.fmul %4, %4 : f64\n", + "%t = llvm.fmul %4, %4 : f64\n" + " %5 = llvm.fmul %t, %4 : f64\n", ), ( "llvm_ir", - "%6 = fmul double %5, %5\n", - "%t = fmul double %5, %5\n" + "%6 = fmul double %t, %5\n", + "%5 = fmul double %4, %4\n", + "%t = fmul 
double %4, %4\n" + "%5 = fmul double %t, %4\n", ), ( "last", - "%6 = fmul double %5, %5\n", - "%t = fmul double %5, %5\n" + "%6 = fmul double %t, %5\n", + "%5 = fmul double %4, %4\n", + "%t = fmul double %4, %4\n" + "%5 = fmul double %t, %4\n", ), ], ) From 18b27628bd468f4e62de72713146c2831c09037c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 20 Sep 2024 15:50:58 -0400 Subject: [PATCH 112/183] Add inline back to bufferization --- frontend/catalyst/compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 32080e23ed..b92122f403 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -206,6 +206,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt BUFFERIZATION_PASS = ( "BufferizationPass", [ + "inline", "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", From 266ba2447f4b9998705c2f8dcc3b1204bde480b6 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 10:44:59 -0400 Subject: [PATCH 113/183] Add functionOPInterface patch for mhlo --- mlir/Makefile | 5 +++++ mlir/patches/moduleOp-mhlo.patch | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 mlir/patches/moduleOp-mhlo.patch diff --git a/mlir/Makefile b/mlir/Makefile index d966b0a96f..4d0e7de3d8 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -15,6 +15,8 @@ BUILD_TYPE?=Release LLVM_ROOT=$(MK_DIR)/llvm-project LLVM_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch LLVM_FUNC_CALL_PATCH_FILE=$(MK_DIR)/patches/callOp-bufferization.patch +MHLO_ROOT?=$(MK_DIR)/mlir-hlo +MHLO_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-mhlo.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -94,6 +96,9 @@ mhlo: @if patch --dry-run -p1 -N $(TARGET_FILE) $(PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 $(TARGET_FILE) $(PATCH_FILE); \ fi + @if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); \ + fi cmake -G Ninja -S mlir-hlo -B $(MHLO_BUILD_DIR) \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ -DLLVM_ENABLE_ASSERTIONS=ON \ diff --git a/mlir/patches/moduleOp-mhlo.patch b/mlir/patches/moduleOp-mhlo.patch new file mode 100644 index 0000000000..74fe5f3dd9 --- /dev/null +++ b/mlir/patches/moduleOp-mhlo.patch @@ -0,0 +1,21 @@ +diff --git a/transforms/bufferize_pass.cc b/transforms/bufferize_pass.cc +index 1e810cff2..c91c49710 100644 +--- a/transforms/bufferize_pass.cc ++++ b/transforms/bufferize_pass.cc +@@ -66,6 +66,7 @@ limitations under the License. + #include "mlir/IR/Operation.h" + #include "mlir/IR/PatternMatch.h" + #include "mlir/IR/Visitors.h" ++#include "mlir/Interfaces/FunctionInterfaces.h" + #include "mlir/Support/LLVM.h" + #include "mlir/Transforms/DialectConversion.h" + #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +@@ -235,7 +236,7 @@ struct OneShotBufferizePass + opts.allowReturnAllocsFromLoops = true; + opts.bufferizeFunctionBoundaries = true; + opts.functionArgTypeConverterFn = +- [=](TensorType tensorType, Attribute memorySpace, func::FuncOp funcOp, ++ [=](TensorType tensorType, Attribute memorySpace, FunctionOpInterface funcOp, + const bufferization::BufferizationOptions& options) { + // Functions created by fusion outlining should have fully dynamic + // layout. 
All other functions (for now only "main") gets static From 9d5fc4d892322d333f1829e86d7e9fed69966c22 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 20:49:32 -0400 Subject: [PATCH 114/183] Correct bufferizeToMemoryWrite for setState and setBasisState --- mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index d176098b8f..fb3c0c64c7 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -312,7 +312,7 @@ struct SetStateOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return true; + return false; } bufferization::AliasingValueList @@ -353,7 +353,7 @@ struct SetBasisStateOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return true; + return false; } bufferization::AliasingValueList From 0e10abb811147e257e939d40474d038fc0df7711 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 21:38:55 -0400 Subject: [PATCH 115/183] Update linux-x86_64 wheel --- .github/workflows/build-wheel-linux-x86_64.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 109886598f..87bc8bd6d1 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -172,6 +172,11 @@ jobs: if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH + export LLVM_ROOT=mlir/llvm-project + export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch + export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ @@ -208,7 +213,10 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch + export MHLO_ROOT?=mlir/mlir-hlo + export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi + if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ From a1dd5d5c57ff4f47130285491cb03c2b4f445e47 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 21:40:50 -0400 Subject: [PATCH 116/183] Update macos-arm64 wheel --- .github/workflows/build-wheel-linux-x86_64.yaml | 2 ++ .github/workflows/build-wheel-macos-arm64.yaml | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml 
b/.github/workflows/build-wheel-linux-x86_64.yaml index 87bc8bd6d1..9854315428 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -172,11 +172,13 @@ jobs: if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH + export LLVM_ROOT=mlir/llvm-project export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index 2fd6220c61..c8b0864fb3 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -137,6 +137,11 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + export LLVM_ROOT=mlir/llvm-project + export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch + export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ @@ -172,7 +177,10 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch + export MHLO_ROOT?=mlir/mlir-hlo + export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi + if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ From 4718c149a17f7c5a4b14bd1efa8b98cc168c44e2 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 21:42:40 -0400 Subject: [PATCH 117/183] Update macos-x86-64 wheel --- .github/workflows/build-wheel-macos-arm64.yaml | 1 + .github/workflows/build-wheel-macos-x86_64.yaml | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index c8b0864fb3..5808bb58a7 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -142,6 +142,7 @@ jobs: export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi if patch 
--dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index f03f324617..df18030527 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -133,6 +133,12 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + export LLVM_ROOT=mlir/llvm-project + export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch + export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ @@ -168,7 +174,10 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch + export MHLO_ROOT?=mlir/mlir-hlo + export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi + if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ From 64aa9ffb63e4dd45dd7ff5db960d9a268d755076 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 21:45:05 -0400 Subject: [PATCH 118/183] Update linux_arm64 scripts --- .github/workflows/scripts/linux_arm64/rh8/build_llvm.sh | 6 ++++++ .github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh | 3 +++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh index 78bb6aadb8..70c3d21a7a 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh @@ -33,6 +33,12 @@ export PATH=/opt/_internal/cpython-${PYTHON_VERSION}.${PYTHON_SUBVERSION}/bin:/o # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja +export LLVM_ROOT=mlir/llvm-project +export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch +export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch +if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi +if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + # Build LLVM cmake -S /catalyst/mlir/llvm-project/llvm -B /catalyst/llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh 
b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh index 2a5b2e4fa7..b596d1f1c0 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh @@ -38,7 +38,10 @@ sed -i -e 's/LINK_LIBS PUBLIC/LINK_LIBS PUBLIC MLIRDeallocationUtils/g' mlir/mli export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch +export MHLO_ROOT?=mlir/mlir-hlo +export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi +if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi # Build MHLO cmake -S /catalyst/mlir/mlir-hlo -B /catalyst/mhlo-build -G Ninja \ From cf56867fabbe996371567c70fcdcac5befc65013 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 22:05:57 -0400 Subject: [PATCH 119/183] add lcurses flag to compile_executable --- frontend/catalyst/debug/compiler_functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/catalyst/debug/compiler_functions.py b/frontend/catalyst/debug/compiler_functions.py index ff3f616f1f..5df5de5c9f 100644 --- a/frontend/catalyst/debug/compiler_functions.py +++ b/frontend/catalyst/debug/compiler_functions.py @@ -335,6 +335,7 @@ def f(x): f"-Wl,-rpath,{python_lib_dir_path}", f"-L{python_lib_dir_path}", "-lpython" + version_str, + "-lcurses" ] # Linker in macOS might use @rpath/Python3.framework/Versions/3.x/Python3. From 6862700cebcb6d1f4c9203ec78ad09f49955aefb Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 23 Sep 2024 23:27:17 -0400 Subject: [PATCH 120/183] Add copy-before-write only for async tests --- frontend/catalyst/compiler.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 8b9202d7d5..babac7a5cc 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -235,6 +235,28 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt ], ) +BUFFERIZATION_ASYNC_PASS = ( + "BufferizationPass", + [ + "inline", + "gradient-preprocess", + "eliminate-empty-tensors", + "convert-elementwise-to-linalg", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops function-boundary-type-conversion=identity-layout-map copy-before-write}", + "canonicalize", # Remove dead memrefToTensorOp's + "gradient-postprocess", + # introduced during gradient-bufferize of callbacks + "func.func(buffer-hoisting)", + "func.func(buffer-loop-hoisting)", + "func.func(buffer-deallocation)", + "convert-arraylist-to-memref", + "convert-bufferization-to-memref", + "canonicalize", # Must be after convert-bufferization-to-memref + # otherwise there are issues in lowering of dynamic tensors. 
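# [Editor's note, annotation rather than diff content: relative to the plain
# BUFFERIZATION_PASS pipeline above, the only functional change in this list is
# the copy-before-write option on one-shot-bufferize, which skips the in-place
# analysis and inserts a buffer copy before every write; presumably this trades
# extra allocations for safety when the async runtime overlaps tape execution.]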
+ # "cse", + "cp-global-memref", + ], +) MLIR_TO_LLVM_PASS = ( "MLIRToLLVMDialect", @@ -289,8 +311,6 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt HLO_LOWERING_PASS, QUANTUM_COMPILATION_PASS, BUFFERIZATION_PASS, - # BUFFERIZATION_PASS2, - # BUFFERIZATION_PASS3, MLIR_TO_LLVM_PASS, ] @@ -306,9 +326,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt TAPE_SPLITTING_PASS, HLO_LOWERING_PASS, QUANTUM_COMPILATION_PASS, - BUFFERIZATION_PASS, - # BUFFERIZATION_PASS2, - # BUFFERIZATION_PASS3, + BUFFERIZATION_ASYNC_PASS, MLIR_TO_LLVM_ASYNC_PASS, ] From ee47a0c7caa0472979fcb821744b18d95d262b0a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 10:13:42 -0400 Subject: [PATCH 121/183] Clear subview work around in cg_global_bufferize --- .../Quantum/Transforms/cp_global_buffers.cpp | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp index e622604e07..d1cba0993d 100644 --- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp +++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp @@ -106,33 +106,6 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) Value newMemRef = rewriter.create(loc, newMemrefType, dynDims); // Cast memrefType back to maintain memory layout. - if (!memref::CastOp::areCastCompatible(newMemrefType, origMemrefType)) { - //if (!origMemrefType.getLayout().isIdentity()) { - SmallVector sizes = - memref::getMixedSizes(rewriter, loc, newMemRef); - - // getlayout - auto layout = origMemrefType.getLayout(); - auto stridedAttr = cast(layout); - auto offset = stridedAttr.getOffset(); - auto strideArr = stridedAttr.getStrides(); - - // Rebuild strides and offsets info - SmallVector strides; - for (auto stride : strideArr) { - strides.push_back(rewriter.getIndexAttr(stride)); - } - SmallVector offsets(origMemrefType.getRank(), - rewriter.getIndexAttr(0)); - offsets[0] = rewriter.getIndexAttr(offset); - auto subview = rewriter.create( - loc, origMemrefType, newMemRef, - offsets, sizes, strides); - - rewriter.create(loc, memref, newMemRef); - return subview; - } - Value castMemRef = rewriter.create(loc, origMemrefType, newMemRef); rewriter.create(loc, memref, newMemRef); return castMemRef; From b858cde601795640d2895e46236d5ecee0202b03 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 10:23:50 -0400 Subject: [PATCH 122/183] Reformatting --- frontend/catalyst/compiler.py | 6 ++++-- frontend/catalyst/debug/compiler_functions.py | 2 +- .../Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 9 ++++++--- mlir/lib/Quantum/Transforms/cp_global_buffers.cpp | 5 +++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index b3ba3c3ae4..ba42c376c1 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -221,7 +221,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops function-boundary-type-conversion=identity-layout-map}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops " + "function-boundary-type-conversion=identity-layout-map}", "canonicalize", # Remove dead memrefToTensorOp's "gradient-postprocess", # introduced during 
gradient-bufferize of callbacks @@ -244,7 +245,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops function-boundary-type-conversion=identity-layout-map copy-before-write}", + "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops " + "function-boundary-type-conversion=identity-layout-map copy-before-write}", "canonicalize", # Remove dead memrefToTensorOp's "gradient-postprocess", # introduced during gradient-bufferize of callbacks diff --git a/frontend/catalyst/debug/compiler_functions.py b/frontend/catalyst/debug/compiler_functions.py index 5df5de5c9f..1f65e90445 100644 --- a/frontend/catalyst/debug/compiler_functions.py +++ b/frontend/catalyst/debug/compiler_functions.py @@ -335,7 +335,7 @@ def f(x): f"-Wl,-rpath,{python_lib_dir_path}", f"-L{python_lib_dir_path}", "-lpython" + version_str, - "-lcurses" + "-lcurses", ] # Linker in macOS might use @rpath/Python3.framework/Versions/3.x/Python3. diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 33bda43f2d..c22ef85d03 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -91,10 +91,13 @@ struct CustomCallOpInterface if (!memrefType) return failure(); if (!memrefType.getLayout().isIdentity()) { - auto nonStrideType = MemRefType::get(memrefType.getShape(), memrefType.getElementType()); - auto newMemRef = rewriter.create(op->getLoc(), nonStrideType, *opBuffer); + auto nonStrideType = + MemRefType::get(memrefType.getShape(), memrefType.getElementType()); + auto newMemRef = + rewriter.create(op->getLoc(), nonStrideType, *opBuffer); bufferArgs.push_back(newMemRef); - } else { + } + else { bufferArgs.push_back(*opBuffer); } } diff --git a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp index d1cba0993d..160adb70d6 100644 --- a/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp +++ b/mlir/lib/Quantum/Transforms/cp_global_buffers.cpp @@ -89,7 +89,8 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) { auto origMemrefType = cast(memref.getType()); // Rebuild MemRefType without memory layout. - auto newMemrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); + auto newMemrefType = + MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); llvm::SmallVector dynDims; { @@ -103,7 +104,7 @@ Value allocCopyMemrefDyn(Location loc, Value memref, PatternRewriter &rewriter) ndim++; } } - + Value newMemRef = rewriter.create(loc, newMemrefType, dynDims); // Cast memrefType back to maintain memory layout. 
Value castMemRef = rewriter.create(loc, origMemrefType, newMemRef); From e8eb51c2028e6c548c0b30d554b686ef691a4720 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 14:20:32 -0400 Subject: [PATCH 123/183] Try fixing wheel --- .github/workflows/build-wheel-linux-x86_64.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 9854315428..783a01351a 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -172,7 +172,6 @@ jobs: if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH - export LLVM_ROOT=mlir/llvm-project export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch @@ -212,7 +211,6 @@ jobs: # building with LLD is a strong requirement for mhlo run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH - export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch export MHLO_ROOT?=mlir/mlir-hlo From da6888314291aeb9eee467bfde0a70ae117fe46c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 15:11:52 -0400 Subject: [PATCH 124/183] Debug wheel --- .github/workflows/build-wheel-linux-x86_64.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 783a01351a..1e6c231584 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -161,6 +161,11 @@ jobs: - name: Build LLD if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + export LLVM_ROOT=mlir/llvm-project + export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch + export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_TARGETS_TO_BUILD="host" \ From f0364efffa885e8ee906de73066505b91e9cea5e Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 15:17:27 -0400 Subject: [PATCH 125/183] Try getting rid of exports --- .github/workflows/build-wheel-linux-x86_64.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 1e6c231584..7ceedaff47 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -161,11 +161,8 @@ jobs: - name: Build LLD if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - export LLVM_ROOT=mlir/llvm-project - export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch - export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 
--directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_TARGETS_TO_BUILD="host" \ From 01555851a1b405d5d9d98194cff2ff10c792092f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 15:22:28 -0400 Subject: [PATCH 126/183] Debug llvm path --- .github/workflows/build-wheel-linux-x86_64.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 7ceedaff47..b625e0391d 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -161,6 +161,7 @@ jobs: - name: Build LLD if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + pwd if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ From 08b44939fcd09e37883a6ab4cec872b9151d5f2c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 15:28:50 -0400 Subject: [PATCH 127/183] Try different paths --- .github/workflows/build-wheel-linux-x86_64.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index b625e0391d..3b7657d1b3 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -161,9 +161,9 @@ jobs: - name: Build LLD if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - pwd - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/moduleOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/llvm-project/patches/callOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + cmake -S 
mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_TARGETS_TO_BUILD="host" \ @@ -175,9 +175,10 @@ jobs: if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH + export LLVM_ROOT=mlir/llvm-project - export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch - export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch + export LLVM_MODULE_PATCH_FILE=mlir/patches/moduleOp-bufferization.patch + export LLVM_FUNC_CALL_PATCH_FILE=mlir/patches/callOp-bufferization.patch if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi From a6281e01533d6c4956d4bd9c33a4e7717ad02258 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 24 Sep 2024 15:39:28 -0400 Subject: [PATCH 128/183] Update patch paths in wheel scripts --- .github/workflows/build-wheel-linux-x86_64.yaml | 16 ++++------------ .github/workflows/build-wheel-macos-arm64.yaml | 13 ++++--------- .github/workflows/build-wheel-macos-x86_64.yaml | 11 +++-------- .../scripts/linux_arm64/rh8/build_llvm.sh | 7 ++----- .../scripts/linux_arm64/rh8/build_mhlo.sh | 4 +--- 5 files changed, 14 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 3b7657d1b3..6a42ab3e00 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -161,9 +161,6 @@ jobs: - name: Build LLD if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_TARGETS_TO_BUILD="host" \ @@ -176,12 +173,9 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH - export LLVM_ROOT=mlir/llvm-project - export LLVM_MODULE_PATCH_FILE=mlir/patches/moduleOp-bufferization.patch - export LLVM_FUNC_CALL_PATCH_FILE=mlir/patches/callOp-bufferization.patch - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi - + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ 
-DLLVM_BUILD_EXAMPLES=OFF \ @@ -217,10 +211,8 @@ jobs: export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch - export MHLO_ROOT?=mlir/mlir-hlo - export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index 5808bb58a7..d7dfa80c28 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -137,12 +137,9 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - export LLVM_ROOT=mlir/llvm-project - export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch - export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi - + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_EXAMPLES=OFF \ @@ -178,10 +175,8 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch - export MHLO_ROOT?=mlir/mlir-hlo - export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index df18030527..6dc80356a7 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -133,11 +133,8 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: 
| - export LLVM_ROOT=mlir/llvm-project - export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch - export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi - if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ @@ -174,10 +171,8 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch - export MHLO_ROOT?=mlir/mlir-hlo - export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh index 70c3d21a7a..6323558df1 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh @@ -33,11 +33,8 @@ export PATH=/opt/_internal/cpython-${PYTHON_VERSION}.${PYTHON_SUBVERSION}/bin:/o # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja -export LLVM_ROOT=mlir/llvm-project -export LLVM_MODULE_PATCH_FILE=mlir/llvm-project/patches/moduleOp-bufferization.patch -export LLVM_FUNC_CALL_PATCH_FILE=mlir/llvm-project/patches/callOp-bufferization.patch -if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); fi -if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); fi +if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/moduleOp-bufferization.patch; fi +if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch; fi # Build LLVM cmake -S /catalyst/mlir/llvm-project/llvm -B /catalyst/llvm-build -G Ninja \ diff --git 
a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh index b596d1f1c0..a98ca20787 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh @@ -38,10 +38,8 @@ sed -i -e 's/LINK_LIBS PUBLIC/LINK_LIBS PUBLIC MLIRDeallocationUtils/g' mlir/mli export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch -export MHLO_ROOT?=mlir/mlir-hlo -export MHLO_MODULE_PATCH_FILE=mlir/patches/moduleOp-mhlo.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi -if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); fi +if patch --dry-run -p1 -N --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/moduleOp-mhlo.patch; fi # Build MHLO cmake -S /catalyst/mlir/mlir-hlo -B /catalyst/mhlo-build -G Ninja \ From df1c23fad87ef1b784b36c23effe312b903ddced Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 10:52:55 -0400 Subject: [PATCH 129/183] Apply patch when getting LLVM and MHLO sources --- .github/workflows/build-wheel-linux-x86_64.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 6a42ab3e00..45ffdd1039 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -294,6 +294,9 @@ jobs: key: llvm-${{ needs.constants.outputs.llvm_version }}-container-source enableCrossOsArchive: True fail-on-cache-miss: True + run: | + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - name: Get Cached LLVM Build id: cache-llvm-build @@ -311,6 +314,11 @@ jobs: key: mhlo-${{ needs.constants.outputs.mhlo_version }}-container-source enableCrossOsArchive: True fail-on-cache-miss: True + run: | + export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt + export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch + if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi - name: Get Cached MHLO Build id: cache-mhlo-build From b5b387678ebc3933a9f22331aad86c2d4c7c0084 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 11:02:51 -0400 Subject: [PATCH 130/183] Revert "Apply patch when getting LLVM and MHLO sources" This reverts commit df1c23fad87ef1b784b36c23effe312b903ddced. 
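[Editor's note on the revert above: the cached-source entries in these
workflows are "uses: actions/cache/restore" steps, and a GitHub Actions step
may specify either "uses:" or "run:", never both, so the patch commands added
in PATCH 129 made the workflow invalid; the next two patches relocate them into
ordinary run steps instead. Since the same apply-once idiom recurs throughout
the series, a minimal standalone sketch may help. "apply_once" is a
hypothetical helper name for illustration, not code from this repository:

    # Probe with --dry-run first; -N rejects hunks that are already applied,
    # so re-running against a cached, already-patched tree is a no-op.
    apply_once() {
        local dir="$1" patch_file="$2"
        if patch --dry-run -p1 -N --directory="$dir" < "$patch_file" > /dev/null 2>&1; then
            patch -p1 --directory="$dir" < "$patch_file"
        fi
    }
    apply_once mlir/llvm-project mlir/patches/moduleOp-bufferization.patch]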
--- .github/workflows/build-wheel-linux-x86_64.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 45ffdd1039..6a42ab3e00 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -294,9 +294,6 @@ jobs: key: llvm-${{ needs.constants.outputs.llvm_version }}-container-source enableCrossOsArchive: True fail-on-cache-miss: True - run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - name: Get Cached LLVM Build id: cache-llvm-build @@ -314,11 +311,6 @@ jobs: key: mhlo-${{ needs.constants.outputs.mhlo_version }}-container-source enableCrossOsArchive: True fail-on-cache-miss: True - run: | - export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt - export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch - if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi - name: Get Cached MHLO Build id: cache-mhlo-build From 0e0406bfecba1dee6e4df55c03d75596df414467 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 11:06:18 -0400 Subject: [PATCH 131/183] Patch LLVM again before building wheel --- .github/workflows/build-wheel-linux-x86_64.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 6a42ab3e00..ee7a2777ba 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -387,6 +387,9 @@ jobs: - name: Build wheel run: | + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + PYTHON=python${{ matrix.python_version }} \ LLVM_BUILD_DIR=$GITHUB_WORKSPACE/llvm-build \ MHLO_BUILD_DIR=$GITHUB_WORKSPACE/mhlo-build \ From 98fb3db5f281119aacce132a0d37202e5af5f217 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 11:17:32 -0400 Subject: [PATCH 132/183] Apply patches right before building MLIR Dialects --- .github/workflows/build-wheel-linux-x86_64.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index ee7a2777ba..df8b19586b 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -368,6 +368,9 @@ jobs: - name: Build MLIR Dialects run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH + if patch --dry-run -p1 -N --directory=mlir/llvm-project < 
mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=ON \ @@ -387,9 +390,6 @@ jobs: - name: Build wheel run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - PYTHON=python${{ matrix.python_version }} \ LLVM_BUILD_DIR=$GITHUB_WORKSPACE/llvm-build \ MHLO_BUILD_DIR=$GITHUB_WORKSPACE/mhlo-build \ From 9401b9774c8fab1321668a71d46cc0bec3e15b26 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 11:49:29 -0400 Subject: [PATCH 133/183] Construct BUFFERIZATION_ASYNC_PASS in a concise way --- frontend/catalyst/compiler.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index ba42c376c1..be1d8f8d38 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -241,24 +241,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt BUFFERIZATION_ASYNC_PASS = ( "BufferizationPass", [ - "inline", - "gradient-preprocess", - "eliminate-empty-tensors", - "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops " - "function-boundary-type-conversion=identity-layout-map copy-before-write}", - "canonicalize", # Remove dead memrefToTensorOp's - "gradient-postprocess", - # introduced during gradient-bufferize of callbacks - "func.func(buffer-hoisting)", - "func.func(buffer-loop-hoisting)", - "func.func(buffer-deallocation)", - "convert-arraylist-to-memref", - "convert-bufferization-to-memref", - "canonicalize", # Must be after convert-bufferization-to-memref - # otherwise there are issues in lowering of dynamic tensors. 
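# [Editor's note, annotation rather than diff content: the comprehension added
# below derives the async list from BUFFERIZATION_PASS[1] instead of keeping a
# duplicated copy, rewriting only the one-shot-bufferize entry, i.e.
#   "one-shot-bufferize{... identity-layout-map}"
#   becomes "one-shot-bufferize{... identity-layout-map copy-before-write}",
# so the two pipelines can no longer drift apart as options are edited.]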
- # "cse", - "cp-global-memref", + s.replace("}", " copy-before-write}") if s.startswith("one-shot-bufferize") else s + for s in BUFFERIZATION_PASS[1] ], ) From c4b7b5e91c076d164ce069e009df6f5fb97bad2f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 11:50:31 -0400 Subject: [PATCH 134/183] Patch LLVM when building dialects in all the other wheel scripts --- .github/workflows/build-wheel-linux-x86_64.yaml | 1 + .github/workflows/build-wheel-macos-arm64.yaml | 4 ++++ .github/workflows/build-wheel-macos-x86_64.yaml | 4 ++++ .github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh | 5 +++++ 4 files changed, 14 insertions(+) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index df8b19586b..63baa19bfd 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -371,6 +371,7 @@ jobs: if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=ON \ diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index d7dfa80c28..5bccc308b2 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -332,6 +332,10 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=ON \ diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index 6dc80356a7..17da2a63fc 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -323,6 +323,10 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < 
mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=ON \ diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh index aa7692215a..e6165d2f40 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh @@ -37,6 +37,11 @@ export PATH=/catalyst/llvm-build/bin:/opt/_internal/cpython-${PYTHON_VERSION}.${ # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja delocate 'amazon-braket-pennylane-plugin>1.27.1' +# Patch LLVM and MHLO +if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi +if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi +if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + # Build Catalyst runtime cmake -S runtime -B runtime-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ From c36e1602f9c31ae347bae39bd73e083bbddf8675 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 13:48:06 -0400 Subject: [PATCH 135/183] Create cache with patches' hash --- .github/workflows/check-catalyst.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/check-catalyst.yaml b/.github/workflows/check-catalyst.yaml index 073824d86d..c782f38029 100644 --- a/.github/workflows/check-catalyst.yaml +++ b/.github/workflows/check-catalyst.yaml @@ -128,7 +128,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} - name: Install Deps if: steps.cache-llvm-build.outputs.cache-hit != 'true' @@ -184,7 +184,7 @@ jobs: uses: actions/cache@v4 with: path: mhlo-build - key: ${{ runner.os }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} - name: Get Cached LLVM Source id: cache-llvm-source @@ -202,7 +202,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Install Deps @@ -272,7 +272,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} 
fail-on-cache-miss: true - name: Install Deps @@ -324,7 +324,7 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Get Cached MHLO Source @@ -341,7 +341,7 @@ jobs: uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ runner.os }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Get Cached Enzyme Source @@ -428,7 +428,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Download Quantum Build Artifact @@ -500,7 +500,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Download Quantum Build Artifact @@ -555,7 +555,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-default-build-${{ matrix.compiler }} + key: ${{ runner.os }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-build-${{ matrix.compiler }} fail-on-cache-miss: true - name: Download Quantum Build Artifact From 2e6324a6b1d9922b0040dbd52f8a4ac51f559296 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 14:44:28 -0400 Subject: [PATCH 136/183] Update patch and its name --- .../workflows/build-wheel-linux-x86_64.yaml | 8 +- .../workflows/build-wheel-macos-arm64.yaml | 4 +- .../workflows/build-wheel-macos-x86_64.yaml | 8 +- .../scripts/linux_arm64/rh8/build_catalyst.sh | 4 +- .../scripts/linux_arm64/rh8/build_llvm.sh | 2 +- .../scripts/linux_arm64/rh8/build_mhlo.sh | 2 +- mlir/Makefile | 8 +- ...> FunctionOpInterface-bufferization.patch} | 103 ++++++++++-------- ...o.patch => FunctionOpInterface-mhlo.patch} | 0 9 files changed, 76 insertions(+), 63 deletions(-) rename mlir/patches/{moduleOp-bufferization.patch => FunctionOpInterface-bufferization.patch} (93%) rename mlir/patches/{moduleOp-mhlo.patch => FunctionOpInterface-mhlo.patch} (100%) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index 63baa19bfd..b6266eedc8 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -173,7 +173,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < 
mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ @@ -212,7 +212,7 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ @@ -368,9 +368,9 @@ jobs: - name: Build MLIR Dialects run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index 5bccc308b2..6382f906d5 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -137,7 +137,7 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ @@ -332,7 +332,7 @@ jobs: # Build Quantum and 
Gradient Dialects - name: Build MLIR Dialects run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index 17da2a63fc..91978928f0 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -133,7 +133,7 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi cmake -S mlir/llvm-project/llvm -B llvm-build -G Ninja \ @@ -172,7 +172,7 @@ jobs: export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi - if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi cmake -S mlir/mlir-hlo -B mhlo-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ @@ -323,9 +323,9 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | - if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi - if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < 
mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi cmake -S mlir -B quantum-build -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh index e6165d2f40..bc35956fa0 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh @@ -38,9 +38,9 @@ export PATH=/catalyst/llvm-build/bin:/opt/_internal/cpython-${PYTHON_VERSION}.${ /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja delocate 'amazon-braket-pennylane-plugin>1.27.1' # Patch LLVM and MHLO -if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/moduleOp-bufferization.patch; fi +if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi -if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi +if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi # Build Catalyst runtime cmake -S runtime -B runtime-build -G Ninja \ diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh index 6323558df1..a17a928e04 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh @@ -33,7 +33,7 @@ export PATH=/opt/_internal/cpython-${PYTHON_VERSION}.${PYTHON_SUBVERSION}/bin:/o # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja -if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/moduleOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/moduleOp-bufferization.patch; fi +if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch; fi # Build LLVM diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh index a98ca20787..d4fc7a7f38 100644 --- 
a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh @@ -39,7 +39,7 @@ sed -i -e 's/LINK_LIBS PUBLIC/LINK_LIBS PUBLIC MLIRDeallocationUtils/g' mlir/mli export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi -if patch --dry-run -p1 -N --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/moduleOp-mhlo.patch; fi +if patch --dry-run -p1 -N --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch; fi # Build MHLO cmake -S /catalyst/mlir/mlir-hlo -B /catalyst/mhlo-build -G Ninja \ diff --git a/mlir/Makefile b/mlir/Makefile index 4d0e7de3d8..e3b4820a3c 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -13,10 +13,10 @@ RT_BUILD_DIR?=$(MK_DIR)/../runtime/build ENABLE_ASAN?=OFF BUILD_TYPE?=Release LLVM_ROOT=$(MK_DIR)/llvm-project -LLVM_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-bufferization.patch +LLVM_FUNCOP_PATCH_FILE=$(MK_DIR)/patches/FunctionOpInterface-bufferization.patch LLVM_FUNC_CALL_PATCH_FILE=$(MK_DIR)/patches/callOp-bufferization.patch MHLO_ROOT?=$(MK_DIR)/mlir-hlo -MHLO_MODULE_PATCH_FILE=$(MK_DIR)/patches/moduleOp-mhlo.patch +MHLO_MODULE_PATCH_FILE=$(MK_DIR)/patches/FunctionOpInterface-mhlo.patch TARGET_FILE=$(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt PATCH_FILE=$(MK_DIR)/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -59,8 +59,8 @@ all: llvm mhlo enzyme dialects .PHONY: llvm llvm: @echo "build LLVM and MLIR enabling Python bindings" - @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ - patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_MODULE_PATCH_FILE); \ + @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNCOP_PATCH_FILE) > /dev/null 2>&1; then \ + patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNCOP_PATCH_FILE); \ fi @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNC_CALL_PATCH_FILE); \ diff --git a/mlir/patches/moduleOp-bufferization.patch b/mlir/patches/FunctionOpInterface-bufferization.patch similarity index 93% rename from mlir/patches/moduleOp-bufferization.patch rename to mlir/patches/FunctionOpInterface-bufferization.patch index f66ca1b529..60a2e9b93f 100644 --- a/mlir/patches/moduleOp-bufferization.patch +++ b/mlir/patches/FunctionOpInterface-bufferization.patch @@ -1,21 +1,25 @@ diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h -index 2fda091e412a..ba28596d1f97 100644 +index 2fda091e412a..eb0df1d92d6a 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h -@@ -9,6 +9,7 @@ - #ifndef MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZABLEOPINTERFACE_H_ - #define MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZABLEOPINTERFACE_H_ +@@ -11,6 +11,7 @@ -+#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/IR/Operation.h" #include "mlir/IR/PatternMatch.h" 
++#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Support/LLVM.h" -@@ -262,7 +263,7 @@ struct BufferizationOptions { + #include "llvm/ADT/DenseMapInfoVariant.h" + #include "llvm/ADT/SetVector.h" +@@ -260,9 +261,9 @@ struct BufferizationOptions { + using AnalysisStateInitFn = std::function; + /// Tensor -> MemRef type converter. /// Parameters: Value, memory space, func op, bufferization options - using FunctionArgTypeConverterFn = - std::function; -+ FunctionOpInterface, const BufferizationOptions &)>; ++ using FunctionArgTypeConverterFn = std::function; /// Tensor -> MemRef type converter. /// Parameters: Value, memory space, bufferization options using UnknownTypeConverterFn = std::function(b.getTerminator())) { -+static Operation* getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { ++static Operation *getAssumedUniqueReturnOp(FunctionOpInterface funcOp) { + Operation *returnOp = nullptr; + for (Block &b : funcOp.getFunctionBody()) { + auto candidateOp = b.getTerminator(); @@ -126,12 +130,13 @@ index 0a4072605c26..ce90d907b4ca 100644 if (returnOp) return nullptr; returnOp = candidateOp; -@@ -126,16 +127,15 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal, +@@ -126,16 +127,16 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal, /// Store function BlockArguments that are equivalent to/aliasing a returned /// value in FuncAnalysisState. static LogicalResult -aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, -+aliasingFuncOpBBArgsAnalysis(FunctionOpInterface funcOp, OneShotAnalysisState &state, ++aliasingFuncOpBBArgsAnalysis(FunctionOpInterface funcOp, ++ OneShotAnalysisState &state, FuncAnalysisState &funcState) { - if (funcOp.getBody().empty()) { + if (funcOp.getFunctionBody().empty()) { @@ -147,7 +152,7 @@ index 0a4072605c26..ce90d907b4ca 100644 if (!isa(resultIt.value())) continue; int64_t returnIdx = resultIt.index(); -@@ -147,7 +147,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -147,7 +148,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, } // Support only single return-terminated block in the function. @@ -156,32 +161,35 @@ index 0a4072605c26..ce90d907b4ca 100644 assert(returnOp && "expected func with single return op"); for (OpOperand &returnVal : returnOp->getOpOperands()) -@@ -168,7 +168,7 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -168,8 +169,8 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state, return success(); } -static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, -+static void annotateFuncArgAccess(FunctionOpInterface funcOp, int64_t idx, bool isRead, - bool isWritten) { +- bool isWritten) { ++static void annotateFuncArgAccess(FunctionOpInterface funcOp, int64_t idx, ++ bool isRead, bool isWritten) { OpBuilder b(funcOp.getContext()); Attribute accessType; -@@ -189,12 +189,12 @@ static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, + if (isRead && isWritten) { +@@ -189,12 +190,12 @@ static void annotateFuncArgAccess(func::FuncOp funcOp, int64_t idx, bool isRead, /// function with unknown ops, we conservatively assume that such ops bufferize /// to a read + write. 
static LogicalResult -funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, -+funcOpBbArgReadWriteAnalysis(FunctionOpInterface funcOp, OneShotAnalysisState &state, ++funcOpBbArgReadWriteAnalysis(FunctionOpInterface funcOp, ++ OneShotAnalysisState &state, FuncAnalysisState &funcState) { - for (int64_t idx = 0, e = funcOp.getFunctionType().getNumInputs(); idx < e; -+ for (int64_t idx = 0, e = funcOp.getNumArguments(); idx < e; - ++idx) { +- ++idx) { ++ for (int64_t idx = 0, e = funcOp.getNumArguments(); idx < e; ++idx) { // Skip non-tensor arguments. - if (!isa(funcOp.getFunctionType().getInput(idx))) + if (!isa(funcOp.getArgumentTypes()[idx])) continue; bool isRead; bool isWritten; -@@ -204,7 +204,7 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -204,7 +205,7 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, StringRef str = accessAttr.getValue(); isRead = str == "read" || str == "read-write"; isWritten = str == "write" || str == "read-write"; @@ -190,7 +198,7 @@ index 0a4072605c26..ce90d907b4ca 100644 // If the function has no body, conservatively assume that all args are // read + written. isRead = true; -@@ -230,20 +230,19 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, +@@ -230,20 +231,19 @@ funcOpBbArgReadWriteAnalysis(FuncOp funcOp, OneShotAnalysisState &state, /// Remove bufferization attributes on FuncOp arguments. static void removeBufferizationAttributes(BlockArgument bbArg) { @@ -214,7 +222,7 @@ index 0a4072605c26..ce90d907b4ca 100644 SymbolTable::lookupNearestSymbolFrom(callOp, sym)); } -@@ -251,12 +250,12 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { +@@ -251,12 +251,13 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { /// Note: This only adds new equivalence info if the called function was already /// analyzed. // TODO: This does not handle cyclic function call graphs etc. @@ -227,11 +235,12 @@ index 0a4072605c26..ce90d907b4ca 100644 - assert(calledFunction && "could not retrieved called func::FuncOp"); + funcOp->walk([&](CallOpInterface callOp) { + FunctionOpInterface calledFunction = getCalledFunction(callOp); -+ assert(calledFunction && "could not retrieved called FunctionOpInterface"); ++ if (!calledFunction) ++ return WalkResult::skip(); // No equivalence info available for the called function. if (!funcState.equivalentFuncArgs.count(calledFunction)) -@@ -267,7 +266,7 @@ static void equivalenceAnalysis(func::FuncOp funcOp, +@@ -267,7 +268,7 @@ static void equivalenceAnalysis(func::FuncOp funcOp, int64_t bbargIdx = it.second; if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) continue; @@ -240,20 +249,21 @@ index 0a4072605c26..ce90d907b4ca 100644 Value argVal = callOp->getOperand(bbargIdx); state.unionEquivalenceClasses(returnVal, argVal); } -@@ -277,10 +276,10 @@ static void equivalenceAnalysis(func::FuncOp funcOp, +@@ -277,11 +278,9 @@ static void equivalenceAnalysis(func::FuncOp funcOp, } /// Return "true" if the given function signature has tensor semantics. 
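The hunk that follows generalizes `hasTensorSignature` from `func::FuncOp` to `FunctionOpInterface`, which is the crux of this patch: any function-like op, not just `func.func`, can now be ordered and analyzed by one-shot module bufferization. As a self-contained sketch of the resulting helper (assuming the elided predicate type is `mlir::TensorType`, as in the surrounding analysis; the hunk below is the authoritative change):

```
// Standalone sketch of the generalized signature check; illustrative only.
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/ADT/STLExtras.h"

/// Return "true" if the given function-like op has tensors in its signature.
static bool hasTensorSignature(mlir::FunctionOpInterface funcOp)
{
    // Accepting the interface instead of func::FuncOp lets custom
    // function-like ops (e.g. gradient.reverse) reuse the same analysis.
    return llvm::any_of(funcOp.getArgumentTypes(), llvm::IsaPred<mlir::TensorType>) ||
           llvm::any_of(funcOp.getResultTypes(), llvm::IsaPred<mlir::TensorType>);
}
```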
-static bool hasTensorSignature(func::FuncOp funcOp) { - return llvm::any_of(funcOp.getFunctionType().getInputs(), -+static bool hasTensorSignature(FunctionOpInterface funcOp) { -+ return llvm::any_of(funcOp.getArgumentTypes(), - llvm::IsaPred) || +- llvm::IsaPred) || - llvm::any_of(funcOp.getFunctionType().getResults(), -+ llvm::any_of(funcOp.getResultTypes(), - llvm::IsaPred); +- llvm::IsaPred); ++static bool hasTensorSignature(FunctionOpInterface funcOp) { ++ return llvm::any_of(funcOp.getArgumentTypes(), llvm::IsaPred) || ++ llvm::any_of(funcOp.getResultTypes(), llvm::IsaPred); } + /// Store all functions of the `moduleOp` in `orderedFuncOps`, sorted by @@ -291,16 +290,16 @@ static bool hasTensorSignature(func::FuncOp funcOp) { /// retrieve the called FuncOp from any func::CallOp. static LogicalResult @@ -277,7 +287,7 @@ index 0a4072605c26..ce90d907b4ca 100644 if (!returnOp) return funcOp->emitError() << "cannot bufferize a FuncOp with tensors and " -@@ -309,9 +308,9 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, +@@ -309,9 +308,10 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, // Collect function calls and populate the caller map. numberCallOpsContainedInFuncOp[funcOp] = 0; @@ -286,11 +296,12 @@ index 0a4072605c26..ce90d907b4ca 100644 - assert(calledFunction && "could not retrieved called func::FuncOp"); + return funcOp.walk([&](CallOpInterface callOp) -> WalkResult { + FunctionOpInterface calledFunction = getCalledFunction(callOp); -+ assert(calledFunction && "could not retrieved called FunctionOpInterface"); ++ if (!calledFunction) ++ return WalkResult::skip(); // If the called function does not have any tensors in its signature, then // it is not necessary to bufferize the callee before the caller. if (!hasTensorSignature(calledFunction)) -@@ -349,11 +348,11 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, +@@ -349,11 +349,11 @@ getFuncOpsOrderedByCalls(ModuleOp moduleOp, /// most generic layout map as function return types. After bufferizing the /// entire function body, a more concise memref type can potentially be used for /// the return type of the function. @@ -305,16 +316,18 @@ index 0a4072605c26..ce90d907b4ca 100644 SmallVector resultTypes; for (OpOperand &operand : returnOp->getOpOperands()) { -@@ -366,7 +365,7 @@ static void foldMemRefCasts(func::FuncOp funcOp) { +@@ -365,8 +365,8 @@ static void foldMemRefCasts(func::FuncOp funcOp) { + } } - auto newFuncType = FunctionType::get( +- auto newFuncType = FunctionType::get( - funcOp.getContext(), funcOp.getFunctionType().getInputs(), resultTypes); -+ funcOp.getContext(), funcOp.getArgumentTypes(), resultTypes); ++ auto newFuncType = FunctionType::get(funcOp.getContext(), ++ funcOp.getArgumentTypes(), resultTypes); funcOp.setType(newFuncType); } -@@ -379,7 +378,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -379,7 +379,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, FuncAnalysisState &funcState = getOrCreateFuncAnalysisState(state); // A list of functions in the order in which they are analyzed + bufferized. @@ -323,7 +336,7 @@ index 0a4072605c26..ce90d907b4ca 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -388,7 +387,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -388,7 +388,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, return failure(); // Analyze ops. 
@@ -332,7 +345,7 @@ index 0a4072605c26..ce90d907b4ca 100644 if (!state.getOptions().isOpAllowed(funcOp)) continue; -@@ -416,7 +415,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, +@@ -416,7 +416,7 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, void mlir::bufferization::removeBufferizationAttributesInModule( ModuleOp moduleOp) { @@ -341,7 +354,7 @@ index 0a4072605c26..ce90d907b4ca 100644 for (BlockArgument bbArg : op.getArguments()) removeBufferizationAttributes(bbArg); }); -@@ -430,7 +429,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -430,7 +430,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( IRRewriter rewriter(moduleOp.getContext()); // A list of functions in the order in which they are analyzed + bufferized. @@ -350,7 +363,7 @@ index 0a4072605c26..ce90d907b4ca 100644 // A mapping of FuncOps to their callers. FuncCallerMap callerMap; -@@ -439,11 +438,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -439,11 +439,11 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( return failure(); // Bufferize functions. @@ -364,7 +377,7 @@ index 0a4072605c26..ce90d907b4ca 100644 // This function was not analyzed and RaW conflicts were not resolved. // Buffer copies must be inserted before every write. OneShotBufferizationOptions updatedOptions = options; -@@ -463,7 +462,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( +@@ -463,7 +463,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp( // Bufferize all other ops. for (Operation &op : llvm::make_early_inc_range(moduleOp.getOps())) { // Functions were already bufferized. @@ -373,7 +386,7 @@ index 0a4072605c26..ce90d907b4ca 100644 continue; if (failed(bufferizeOp(&op, options, statistics))) return failure(); -@@ -490,12 +489,12 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize( +@@ -490,12 +490,12 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize( // FuncOps whose names are specified in options.noAnalysisFuncFilter will // not be analyzed. Ops in these FuncOps will not be analyzed as well. 
OpFilter::Entry::FilterFn analysisFilterFn = [=](Operation *op) { diff --git a/mlir/patches/moduleOp-mhlo.patch b/mlir/patches/FunctionOpInterface-mhlo.patch similarity index 100% rename from mlir/patches/moduleOp-mhlo.patch rename to mlir/patches/FunctionOpInterface-mhlo.patch From 975b28053aedec504027b942eef18a5907ea8e87 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 16:37:40 -0400 Subject: [PATCH 137/183] Reapply patch when building MLIR dialects --- .github/workflows/check-catalyst.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/check-catalyst.yaml b/.github/workflows/check-catalyst.yaml index c782f38029..65d033d342 100644 --- a/.github/workflows/check-catalyst.yaml +++ b/.github/workflows/check-catalyst.yaml @@ -372,6 +372,10 @@ jobs: - name: Build MLIR Dialects run: | + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi + if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi + CCACHE_DIR="$(pwd)/.ccache" \ C_COMPILER=$(which ${{ needs.constants.outputs[format('c_compiler.{0}', matrix.compiler)] }}) \ CXX_COMPILER=$(which ${{ needs.constants.outputs[format('cxx_compiler.{0}', matrix.compiler)] }}) \ From 3e68cd8bb27bbc786b67322b7b4ccc61836a612a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 17:50:57 -0400 Subject: [PATCH 138/183] Update changelog --- doc/releases/changelog-dev.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/releases/changelog-dev.md b/doc/releases/changelog-dev.md index 61235d045a..4c624d2e80 100644 --- a/doc/releases/changelog-dev.md +++ b/doc/releases/changelog-dev.md @@ -202,6 +202,8 @@ * Cached primitive lowerings is used instead of a custom cache structure. [(#1159)](https://github.com/PennyLaneAI/catalyst/pull/1159) +* Catalyst now supports `one-shot bufferize` from MLIR, which is required for JAX v0.4.29 or higher. [(#1027)](https://github.com/PennyLaneAI/catalyst/pull/1027) +
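For context on the changelog entry above: the frontend rewrites `one-shot-bufferize{...}` pass strings to append ` copy-before-write`, which corresponds roughly to the following programmatic setup. This is a minimal sketch against upstream MLIR's `OneShotBufferizationOptions` and `createOneShotBufferizePass` entry points; Catalyst itself assembles the pipeline from pass strings on the Python side.

```
// Minimal sketch: programmatic equivalent of the textual pass option
// "one-shot-bufferize{copy-before-write}". Illustrative only.
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

void addOneShotBufferize(mlir::PassManager &pm)
{
    mlir::bufferization::OneShotBufferizationOptions options;
    // copy-before-write skips the in-place analysis and inserts a buffer
    // copy before every write, trading copies for analysis complexity.
    options.copyBeforeWrite = true;
    pm.addPass(mlir::bufferization::createOneShotBufferizePass(options));
}
```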

Breaking changes

* Remove `static_size` field from `AbstractQreg` class. @@ -262,6 +264,7 @@ Spencer Comin, Lillian M.A. Frederiksen, Sengthai Heng, David Ittah, +Tzung-Han Juang, Mehrdad Malekmohammadi, Vincent Michaud-Rioux, Romain Moyard, From 661a1d536721851499da689db824825e751d6d1f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 30 Sep 2024 17:54:47 -0400 Subject: [PATCH 139/183] Add patch hash to macos-arm64 --- .github/workflows/build-wheel-macos-arm64.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index 6382f906d5..85fcaf7d2d 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -105,14 +105,14 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Restore MHLO Build id: cache-mhlo-build uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build lookup-only: True - name: Restore Enzyme Build @@ -166,7 +166,7 @@ jobs: uses: actions/cache/save@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Build MHLO Dialect if: steps.cache-mhlo-build.outputs.cache-hit != 'true' @@ -196,7 +196,7 @@ jobs: uses: actions/cache/save@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Build Enzyme if: steps.cache-enzyme-build.outputs.cache-hit != 'true' @@ -260,7 +260,7 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-3.10-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-3.10-wheel-build fail-on-cache-miss: True - name: Get Cached MHLO Source @@ -277,7 +277,7 @@ jobs: uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Get Cached Enzyme Source From 9d5d1179cfd1b55c4179b2a3ca4dab2b2bc4dbc6 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 1 Oct 2024 09:53:47 -0400 Subject: [PATCH 140/183] Add patch hash to linux-arm64 wheel --- .github/workflows/build-wheel-linux-arm64.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/.github/workflows/build-wheel-linux-arm64.yaml b/.github/workflows/build-wheel-linux-arm64.yaml index 304b23ee5d..9e6d70465a 100644 --- a/.github/workflows/build-wheel-linux-arm64.yaml +++ b/.github/workflows/build-wheel-linux-arm64.yaml @@ -63,7 +63,7 @@ jobs: uses: actions/cache@v4 with: path: mlir/llvm-project - key: llvm-${{ needs.constants.outputs.llvm_version }}-default-source + key: llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-source enableCrossOsArchive: True - name: Cache MHLO Source @@ -71,7 +71,7 @@ jobs: uses: actions/cache@v4 with: path: mlir/mlir-hlo - key: mhlo-${{ needs.constants.outputs.mhlo_version }}-default-source + key: mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-default-source enableCrossOsArchive: True - name: Cache Enzyme Source @@ -112,14 +112,14 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-wheel-build + key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Restore MHLO Build id: cache-mhlo-build uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build lookup-only: True - name: Restore Enzyme Build @@ -160,7 +160,7 @@ jobs: uses: actions/cache/save@v4 with: path: llvm-build - key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-wheel-build + key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Build MHLO Dialect if: steps.cache-mhlo-build.outputs.cache-hit != 'true' @@ -179,7 +179,7 @@ jobs: uses: actions/cache/save@v4 with: path: mhlo-build - key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Build Enzyme if: steps.cache-enzyme-build.outputs.cache-hit != 'true' @@ -240,7 +240,7 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-wheel-build + key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Get Cached MHLO Source @@ -257,7 +257,7 @@ jobs: uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_name }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Get Cached Enzyme Source @@ -334,7 +334,7 @@ jobs: uses: actions/cache@v4 with: path: llvm-build - key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-wheel-build + key: ${{ matrix.container_name }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Run Python Pytest Tests From 170d2117b341225b4eb271f8ec5b8e6a3cc3a2f2 Mon Sep 17 00:00:00 2001 From: Tzung-Han 
Juang Date: Tue, 1 Oct 2024 12:06:33 -0400 Subject: [PATCH 141/183] Make x86_64 use patch hash --- .github/workflows/build-wheel-linux-x86_64.yaml | 12 ++++++------ .github/workflows/build-wheel-macos-x86_64.yaml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index b6266eedc8..b9eb13da2c 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -118,14 +118,14 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Restore MHLO Build id: cache-mhlo-build uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build lookup-only: True - name: Restore Enzyme Build @@ -202,7 +202,7 @@ jobs: uses: actions/cache/save@v4 with: path: llvm-build - key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Build MHLO Dialect if: steps.cache-mhlo-build.outputs.cache-hit != 'true' @@ -232,7 +232,7 @@ jobs: uses: actions/cache/save@v4 with: path: mhlo-build - key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Build Enzyme if: steps.cache-enzyme-build.outputs.cache-hit != 'true' @@ -300,7 +300,7 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-3.10-wheel-build + key: ${{ matrix.container_img }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-3.10-wheel-build fail-on-cache-miss: True - name: Get Cached MHLO Source @@ -317,7 +317,7 @@ jobs: uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ matrix.container_img }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Get Cached Enzyme Source diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index 91978928f0..57df711f01 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -103,14 +103,14 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Restore MHLO Build id: 
cache-mhlo-build uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build lookup-only: True - name: Restore Enzyme Build @@ -162,7 +162,7 @@ jobs: uses: actions/cache/save@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-${{matrix.python_version}}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-${{matrix.python_version}}-wheel-build - name: Build MHLO Dialect if: steps.cache-mhlo-build.outputs.cache-hit != 'true' @@ -192,7 +192,7 @@ jobs: uses: actions/cache/save@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build - name: Build Enzyme if: steps.cache-enzyme-build.outputs.cache-hit != 'true' @@ -250,7 +250,7 @@ jobs: uses: actions/cache/restore@v4 with: path: llvm-build - key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-3.10-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ needs.constants.outputs.llvm_version }}-patch-${{ hashFiles('mlir/patches/**') }}-3.10-wheel-build fail-on-cache-miss: True - name: Get Cached MHLO Source @@ -267,7 +267,7 @@ jobs: uses: actions/cache/restore@v4 with: path: mhlo-build - key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-wheel-build + key: ${{ runner.os }}-${{ runner.arch }}-mhlo-${{ needs.constants.outputs.mhlo_version }}-patch-${{ hashFiles('mlir/patches/**') }}-wheel-build fail-on-cache-miss: True - name: Get Cached Enzyme Source From 386f27472a3a9dffdbb084796faed36d7db07186 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 3 Oct 2024 10:36:54 -0400 Subject: [PATCH 142/183] Install ncurses for linux-arm64 wheels --- .github/workflows/scripts/linux_arm64/rh8/test_wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh b/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh index 626d3d93ce..817ea3a8a1 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh @@ -11,7 +11,7 @@ export PYTHON_PACKAGE=$4 # Install system dependencies (gcc gives access to c99, which is needed by some tests) dnf update -y -dnf install -y libzstd-devel gcc-toolset-${GCC_VERSION} gcc +dnf install -y libzstd-devel gcc-toolset-${GCC_VERSION} gcc ncurses-devel if [ "$PYTHON_MAJOR_MINOR" != "3.10" ]; then dnf install -y ${PYTHON_PACKAGE} ${PYTHON_PACKAGE}-devel else From 17c94d60eeb7c20d4afdea7fb540a9fbe787dc7a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 4 Oct 2024 15:59:04 -0400 Subject: [PATCH 143/183] Add top-level comments for bufferization --- .../Transforms/BufferizableOpInterfaceImpl.cpp | 18 ++++++++++++++++++ .../Transforms/BufferizableOpInterfaceImpl.cpp | 18 ++++++++++++++++++ .../Transforms/BufferizableOpInterfaceImpl.cpp | 14 ++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp 
b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index c22ef85d03..7eac6bbaa6 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -14,6 +14,24 @@ using namespace mlir; using namespace catalyst; namespace { +/** + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * and `getAliasingValues`. + * + * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read. + * + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * (if bufferizing in-place). + * + * `getAliasingValues`: Return the OpResults that may share the same buffer as the given OpOperand. + * Note that the MLIR documentation mentions `getAliasingOpOperands` rather than `getAliasingValues`, but the latter appears to serve the same purpose. + * + * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires + * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the implementation of + * `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and `getAliasingOpOperands`. + * + * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize + */ /// Bufferization of catalyst.print. Get memref of printOp.val. struct PrintOpInterface diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 3b620d75a4..242f61043b 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -24,6 +24,24 @@ using namespace mlir; using namespace catalyst::gradient; namespace { +/** + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * and `getAliasingValues`. + * + * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read. + * + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * (if bufferizing in-place). + * + * `getAliasingValues`: Return the OpResults that may share the same buffer as the given OpOperand. + * Note that the MLIR documentation mentions `getAliasingOpOperands` rather than `getAliasingValues`, but the latter appears to serve the same purpose. + * + * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires + * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the implementation of + * `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and `getAliasingOpOperands`. + * + * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize + */ static BaseMemRefType getBufferizedFunctionArgType(FunctionOpInterface funcOp, int64_t index, diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index fb3c0c64c7..d58dffe389 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -10,6 +10,20 @@ using namespace mlir; using namespace catalyst::quantum; namespace { +/** + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * and `getAliasingValues`. + * + * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read.
+ * + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * (if bufferizing in-place). + * + * `getAliasingValues`: Return the OpResults that may share the same buffer as the given OpOperand. + * Note that the MLIR documentation mentions `getAliasingOpOperands` rather than `getAliasingValues`, but the latter appears to serve the same purpose. + * + * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize + */ /// Bufferization of catalyst.quantum.unitary. Convert Matrix into memref. struct QubitUnitaryOpInterface From d58a215fb947a6b67e1e9eef0d7ee7871fb5ad4f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 4 Oct 2024 16:19:27 -0400 Subject: [PATCH 144/183] Add comment that explains RestoreReverseOp --- .../Transforms/PostprocessingPatterns.cpp | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index 0f1b8332ba..d8303e77cf 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -206,12 +206,35 @@ struct PostprocessReverseOp : public OpRewritePattern { }; struct RestoreReverseOp : public OpRewritePattern { + /* One-shot bufferize optimizes away the return values that are not used. + * This pattern aims to revert the changes made by One-shot bufferize. + * + * For example, One-shot bufferize turns + * ``` + * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref attributes {argc = 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : i64} { + * %0 = func.call @bwd(%arg0) : (memref) -> memref + * %alloc = memref.alloc() {alignment = 64 : i64} : memref + * memref.copy %0, %alloc : memref to memref + * gradient.return {empty = true} %alloc : memref + * } + * ``` + * into + * ``` + * gradient.reverse @bwd.rev(%arg0: memref) -> memref attributes {argc = 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : i64} { + * %0 = func.call @bwd(%arg0) : (memref) -> memref + * %alloc = memref.alloc() {alignment = 64 : i64} : memref + * memref.copy %0, %alloc : memref to memref + * gradient.return {empty = true} %alloc : memref + * } + * ``` + * + * However, Enzyme expects to see the removed return. We have to add it back. + */ using OpRewritePattern::OpRewritePattern; mlir::LogicalResult matchAndRewrite(ReverseOp op, mlir::PatternRewriter &rewriter) const override { - // ReverseOp's output is optimized away by one-shot bufferize.
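The interface contract documented in PATCH 143 can be made concrete with a small external model. Below is a hedged sketch for a hypothetical op `MyPrintOp` (one tensor operand, no results); the hook names follow upstream MLIR, while `MyPrintOp` and its `getVal()` accessor are assumptions, not the actual Catalyst implementation.

```
// Hedged sketch of a BufferizableOpInterface external model for a
// hypothetical read-only op `MyPrintOp`; it illustrates the required hooks,
// not the real models in the BufferizableOpInterfaceImpl.cpp files.
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"

using namespace mlir;
using namespace mlir::bufferization;

struct MyPrintOpInterface
    : public BufferizableOpInterface::ExternalModel<MyPrintOpInterface, MyPrintOp> {
    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                                const AnalysisState &state) const
    {
        return true; // the printed tensor's buffer is read
    }

    bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                                 const AnalysisState &state) const
    {
        return false; // printing never writes to the buffer
    }

    AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                        const AnalysisState &state) const
    {
        return {}; // no results, so nothing can alias the operand
    }

    LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                            const BufferizationOptions &options) const
    {
        auto printOp = cast<MyPrintOp>(op);
        FailureOr<Value> buffer = getBuffer(rewriter, printOp.getVal(), options);
        if (failed(buffer))
            return failure();
        replaceOpWithNewBufferizedOp<MyPrintOp>(rewriter, op, *buffer);
        return success();
    }
};

// Registration, mirroring registerBufferizableOpInterfaceExternalModels:
//   MyPrintOp::attachInterface<MyPrintOpInterface>(*ctx);
```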
auto forwardArgc = op.getArgc(); auto forwardResc = op.getResc(); auto tape = op.getTape(); From b438e4d9c014e78b09384ceb3863675eafa76368 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 09:57:35 -0400 Subject: [PATCH 145/183] Update mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp Co-authored-by: David Ittah --- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 7eac6bbaa6..432b135282 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -299,4 +299,4 @@ void catalyst::registerBufferizableOpInterfaceExternalModels(DialectRegistry &re CallbackOp::attachInterface(*ctx); CallbackCallOp::attachInterface(*ctx); }); -} \ No newline at end of file +} From cf4018c1bbdcb40bb21d8234c611a63f6951914a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 10:09:52 -0400 Subject: [PATCH 146/183] Update mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h Co-authored-by: David Ittah --- mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h index c8426be8b3..ae5096eb39 100644 --- a/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h +++ b/mlir/include/Gradient/Transforms/BufferizableOpInterfaceImpl.h @@ -10,4 +10,4 @@ void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry ®ist } -} // namespace catalyst \ No newline at end of file +} // namespace catalyst From 71abe78fe2e5a73186aa51a19118af144eeee6ee Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 10:10:28 -0400 Subject: [PATCH 147/183] Update mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h Co-authored-by: David Ittah --- mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h index e56c914ba4..ec20d6f6c9 100644 --- a/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h +++ b/mlir/include/Catalyst/Transforms/BufferizableOpInterfaceImpl.h @@ -6,4 +6,4 @@ namespace catalyst { void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry ®istry); -} // namespace catalyst \ No newline at end of file +} // namespace catalyst From c1384522567c7c59758d595e637cda78c416b2d3 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 10:33:29 -0400 Subject: [PATCH 148/183] Use compare instead of == --- mlir/lib/Catalyst/Transforms/AsyncUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp index a295156330..e811668e89 100644 --- a/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp +++ b/mlir/lib/Catalyst/Transforms/AsyncUtils.cpp @@ -215,7 +215,7 @@ std::optional AsyncUtils::getCalleeSafe(LLVM::CallOp callOp) bool AsyncUtils::isFunctionNamed(LLVM::LLVMFuncOp funcOp, llvm::StringRef expectedName) { llvm::StringRef observedName = funcOp.getSymName(); - return observedName == expectedName; + return 
observedName.compare(expectedName) == 0; } bool AsyncUtils::isMlirAsyncRuntimeCreateValue(LLVM::LLVMFuncOp funcOp) From e8322ac387512e2fa5c12d268f68d5dbdec89424 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 10:52:01 -0400 Subject: [PATCH 149/183] Add TODO reminder for removing patches after updating jax --- .github/workflows/build-wheel-linux-x86_64.yaml | 3 +++ .github/workflows/build-wheel-macos-arm64.yaml | 3 +++ .github/workflows/build-wheel-macos-x86_64.yaml | 3 +++ .github/workflows/check-catalyst.yaml | 1 + .github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh | 2 +- .github/workflows/scripts/linux_arm64/rh8/build_llvm.sh | 1 + .github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh | 1 + mlir/Makefile | 2 ++ 8 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml index b9eb13da2c..b5685d6c1d 100644 --- a/.github/workflows/build-wheel-linux-x86_64.yaml +++ b/.github/workflows/build-wheel-linux-x86_64.yaml @@ -173,6 +173,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH + # TODO: Remove these patches after upgrading Jax (potentailly for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi @@ -208,6 +209,7 @@ jobs: if: steps.cache-mhlo-build.outputs.cache-hit != 'true' # building with LLD is a strong requirement for mhlo run: | + # TODO: Remove these patches after upgrading Jax (potentailly for 0.4.34 or higher). export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch @@ -367,6 +369,7 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | + # TODO: Remove these patches after upgrading Jax (potentailly for 0.4.34 or higher). export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml index 85fcaf7d2d..0e7b967da9 100644 --- a/.github/workflows/build-wheel-macos-arm64.yaml +++ b/.github/workflows/build-wheel-macos-arm64.yaml @@ -137,6 +137,7 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + # TODO: Remove these patches after upgrading Jax (potentailly for 0.4.34 or higher). 
if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi @@ -173,6 +174,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH

+ # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi @@ -332,6 +334,7 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | + # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/moduleOp-mhlo.patch; fi diff --git a/.github/workflows/build-wheel-macos-x86_64.yaml b/.github/workflows/build-wheel-macos-x86_64.yaml index 57df711f01..3d87d07b0b 100644 --- a/.github/workflows/build-wheel-macos-x86_64.yaml +++ b/.github/workflows/build-wheel-macos-x86_64.yaml @@ -133,6 +133,7 @@ jobs: - name: Build LLVM / MLIR if: steps.cache-llvm-build.outputs.cache-hit != 'true' run: | + # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi @@ -169,6 +170,7 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE/llvm-build/bin:$PATH

+ # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi @@ -323,6 +325,7 @@ jobs: # Build Quantum and Gradient Dialects - name: Build MLIR Dialects run: | + # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher).
if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi diff --git a/.github/workflows/check-catalyst.yaml b/.github/workflows/check-catalyst.yaml index 65d033d342..1b40e723c8 100644 --- a/.github/workflows/check-catalyst.yaml +++ b/.github/workflows/check-catalyst.yaml @@ -372,6 +372,7 @@ jobs:

     - name: Build MLIR Dialects run: | + # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh index bc35956fa0..74687265f9 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_catalyst.sh @@ -37,7 +37,7 @@ export PATH=/catalyst/llvm-build/bin:/opt/_internal/cpython-${PYTHON_VERSION}.${ # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja delocate 'amazon-braket-pennylane-plugin>1.27.1'

-# Patch LLVM and MHLO +# Patch LLVM and MHLO. TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher).
if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/llvm-project < mlir/patches/callOp-bufferization.patch; fi if patch --dry-run -p1 -N --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=mlir/mlir-hlo < mlir/patches/FunctionOpInterface-mhlo.patch; fi diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh index a17a928e04..b4ee206580 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_llvm.sh @@ -33,6 +33,7 @@ export PATH=/opt/_internal/cpython-${PYTHON_VERSION}.${PYTHON_SUBVERSION}/bin:/o # Install python dependencies /usr/bin/python3 -m pip install pennylane pybind11 PyYAML cmake ninja

+# TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/FunctionOpInterface-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/FunctionOpInterface-bufferization.patch; fi if patch --dry-run -p1 -N --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/llvm-project < /catalyst/mlir/patches/callOp-bufferization.patch; fi diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh index d4fc7a7f38..493af40244 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh @@ -39,6 +39,7 @@ sed -i -e 's/LINK_LIBS PUBLIC/LINK_LIBS PUBLIC MLIRDeallocationUtils/g' mlir/mli export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi +# TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch; fi # Build MHLO diff --git a/mlir/Makefile b/mlir/Makefile index e3b4820a3c..14d48a1109 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -58,6 +58,7 @@ all: llvm mhlo enzyme dialects .PHONY: llvm llvm: + # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). @echo "build LLVM and MLIR enabling Python bindings" @if patch --dry-run -p1 -N --directory=$(LLVM_ROOT) < $(LLVM_FUNCOP_PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 --directory=$(LLVM_ROOT) < $(LLVM_FUNCOP_PATCH_FILE); \ fi @@ -96,6 +97,7 @@ mhlo: @if patch --dry-run -p1 -N $(TARGET_FILE) $(PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 $(TARGET_FILE) $(PATCH_FILE); \ fi + # TODO: Remove this patch after upgrading Jax (potentially for 0.4.34 or higher).
@if patch --dry-run -p1 -N --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE) > /dev/null 2>&1; then \ patch -p1 --directory=$(MHLO_ROOT) < $(MHLO_MODULE_PATCH_FILE); \ fi

From 761aaf7281fd5ba3a84cd73845aff2094938658d Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 10:53:17 -0400 Subject: [PATCH 150/183] Update mlir/Makefile

Co-authored-by: David Ittah --- mlir/Makefile | 1 + 1 file changed, 1 insertion(+)

diff --git a/mlir/Makefile b/mlir/Makefile index 14d48a1109..d713adc969 100644 --- a/mlir/Makefile +++ b/mlir/Makefile @@ -12,6 +12,7 @@ ENZYME_BUILD_DIR?=$(MK_DIR)/Enzyme/build RT_BUILD_DIR?=$(MK_DIR)/../runtime/build ENABLE_ASAN?=OFF BUILD_TYPE?=Release +# TODO: remove after JAX upgrade LLVM_ROOT=$(MK_DIR)/llvm-project LLVM_FUNCOP_PATCH_FILE=$(MK_DIR)/patches/FunctionOpInterface-bufferization.patch LLVM_FUNC_CALL_PATCH_FILE=$(MK_DIR)/patches/callOp-bufferization.patch

From 26c56d508fb5f6743f1ee156b43fecc95743a72a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 11:00:18 -0400 Subject: [PATCH 151/183] Add another TODO comment

--- .github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh | 1 + 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh index 493af40244..e452ee22d4 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/build_mhlo.sh @@ -38,6 +38,7 @@ sed -i -e 's/LINK_LIBS PUBLIC/LINK_LIBS PUBLIC MLIRDeallocationUtils/g' mlir/mli export TARGET_FILE=mlir/mlir-hlo/mhlo/transforms/CMakeLists.txt export PATCH_FILE=mlir/patches/mhlo-Add-PassesIncGen-in-transforms-CMakeList.patch +# TODO: Jax has merged this fix. Remove after JAX upgrade. if patch --dry-run -p1 -N $TARGET_FILE $PATCH_FILE > /dev/null 2>&1; then patch -p1 $TARGET_FILE $PATCH_FILE; fi # TODO: Remove these patches after upgrading Jax (potentially for 0.4.34 or higher). if patch --dry-run -p1 -N --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch > /dev/null 2>&1; then patch -p1 --directory=/catalyst/mlir/mlir-hlo < /catalyst/mlir/patches/FunctionOpInterface-mhlo.patch; fi # Build MHLO

From 0296eb9bc2bbc140f1f09e4cf35dfb41332b1ba2 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 11:19:32 -0400 Subject: [PATCH 152/183] Cleanup

--- .../BufferizableOpInterfaceImpl.cpp | 32 +++++++------------ .../BufferizableOpInterfaceImpl.cpp | 18 ++++++----- .../Transforms/PostprocessingPatterns.cpp | 26 +++++++-------- .../BufferizableOpInterfaceImpl.cpp | 11 ++++--- 4 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 432b135282..9f76b737e9 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -15,20 +15,22 @@ using namespace catalyst; namespace { /** - * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, * and `getAliasingValues`. - * + * * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read.
* - * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written * (if bufferizing in-place). * - * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given OpOperand. - * Note that MLIR documentation does not mention `getAliasingValues` but it seems to serve the same purpose. + * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given + * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to + * serve the same purpose. * - * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires - * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the implementation of - * `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and `getAliasingOpOperands`. + * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires + * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the + * implementation of `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and + * `getAliasingOpOperands`. * * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */ @@ -105,19 +107,7 @@ struct CustomCallOpInterface FailureOr opBuffer = getBuffer(rewriter, operand, options); if (failed(opBuffer)) return failure(); - MemRefType memrefType = dyn_cast(opBuffer->getType()); - if (!memrefType) - return failure(); - if (!memrefType.getLayout().isIdentity()) { - auto nonStrideType = - MemRefType::get(memrefType.getShape(), memrefType.getElementType()); - auto newMemRef = - rewriter.create(op->getLoc(), nonStrideType, *opBuffer); - bufferArgs.push_back(newMemRef); - } - else { - bufferArgs.push_back(*opBuffer); - } + bufferArgs.push_back(*opBuffer); } // Add bufferized return values to the arguments diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 242f61043b..ce55763343 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -25,20 +25,22 @@ using namespace catalyst::gradient; namespace { /** - * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, * and `getAliasingValues`. - * + * * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read. * - * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written * (if bufferizing in-place). * - * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given OpOperand. - * Note that MLIR documentation does not mention `getAliasingValues` but it seems to serve the same purpose. + * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given + * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to + * serve the same purpose. * - * Bufferizing FunctionOpInterface is also not documented by MLIR. 
It requires - * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the implementation of - * `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and `getAliasingOpOperands`. + * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the + * implementation of `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and + * `getAliasingOpOperands`. * * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */ diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index d8303e77cf..cfef5c9aae 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -206,28 +206,28 @@ struct PostprocessReverseOp : public OpRewritePattern { }; struct RestoreReverseOp : public OpRewritePattern { - /* One-shot bufferize optimizes away the return values that are not used. + /* One-shot bufferize optimizes away the return values that are not used. * This pass aims to revert the changes made by One-shot bufferize. - * + * * For example, * ``` - * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref attributes {argc = 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : i64} { - * %0 = func.call @bwd(%arg0) : (memref) -> memref - * %alloc = memref.alloc() {alignment = 64 : i64} : memref - * memref.copy %0, %alloc : memref to memref - * gradient.return {empty = true} %alloc : memref + * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref attributes {argc = + * 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape + * = 0 : i64} { %0 = func.call @bwd(%arg0) : (memref) -> memref %alloc = + * memref.alloc() {alignment = 64 : i64} : memref memref.copy %0, %alloc : memref to + * memref gradient.return {empty = true} %alloc : memref * } * ``` - * will be turned into + * will be turned into * * ``` - * gradient.reverse @bwd.rev(%arg0: memref) -> memref attributes {argc = 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : i64} { - * %0 = func.call @bwd(%arg0) : (memref) -> memref - * %alloc = memref.alloc() {alignment = 64 : i64} : memref - * memref.copy %0, %alloc : memref to memref + * gradient.reverse @bwd.rev(%arg0: memref) -> memref attributes {argc = 2 : i64, + * implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : + * i64} { %0 = func.call @bwd(%arg0) : (memref) -> memref %alloc = memref.alloc() + * {alignment = 64 : i64} : memref memref.copy %0, %alloc : memref to memref * gradient.return {empty = true} %alloc : memref * } * ``` - * + * * However, Enzyme expects to see the removed return. We have to add it back. */ using OpRewritePattern::OpRewritePattern; diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index d58dffe389..fe11fcc64a 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -11,16 +11,17 @@ using namespace catalyst::quantum; namespace { /** - * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, + * The new bufferization interface requires `bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, * and `getAliasingValues`.
- * + * `bufferizesToMemoryRead`: Return `true` if the buffer of the given tensor OpOperand is read. * - * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written + * `bufferizesToMemoryWrite`: Return `true` if the buffer of the given tensor OpOperand is written * (if bufferizing in-place). * - * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given OpOperand. - * Note that MLIR documentation does not mention `getAliasingValues` but it seems to serve the same purpose. + * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given + * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to + * serve the same purpose. * * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */

From c27fd57c4a6be36dcaaeed6568759c168d8c9e04 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 11:59:07 -0400 Subject: [PATCH 153/183] Set bufferizesToMemoryWrite for CustomCallOp as true

--- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 9f76b737e9..5316b0191f 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -72,7 +72,7 @@ struct PrintOpInterface } };

-/// Bufferization of catalyst.print. Mainly get buffers for arguments. +/// Bufferization of catalyst.custom_call. Mainly get buffers for arguments. struct CustomCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel<CustomCallOpInterface, CustomCallOp> { @@ -85,7 +85,7 @@ struct CustomCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; }

From b2879cf1443afe3664762deaa4520f1e83566a22 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 13:39:19 -0400 Subject: [PATCH 154/183] Make CustomCallOp not return failure if its operands are not bufferizable

--- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 5316b0191f..1e4dc62ef4 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -106,8 +106,9 @@ struct CustomCallOpInterface for (Value operand : operands) { FailureOr<Value> opBuffer = getBuffer(rewriter, operand, options); if (failed(opBuffer)) - return failure(); - bufferArgs.push_back(*opBuffer); + bufferArgs.push_back(operand); + else + bufferArgs.push_back(*opBuffer); }

         // Add bufferized return values to the arguments

From b69dd2a3852ca013f8cef7d846266d4475e90198 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 14:25:16 -0400 Subject: [PATCH 155/183] Make CustomCallOp not return failure if its results are not bufferizable

--- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 1e4dc62ef4..35adc06e94 100644 --- 
a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -117,7 +117,8 @@ struct CustomCallOpInterface Type resultType = result.getType(); RankedTensorType tensorType = dyn_cast<RankedTensorType>(resultType); if (!tensorType) { - return failure(); + bufferArgs.push_back(result); + continue; } auto options = bufferization::BufferizationOptions(); FailureOr<Value> tensorAlloc = bufferization::allocateTensorForShapedValue(

From 9069ca8c5610a5fdcf53228edc580b8fd6735093 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 14:36:51 -0400 Subject: [PATCH 156/183] Remove unused CallbackOp members

--- .../BufferizableOpInterfaceImpl.cpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 35adc06e94..dc4ed0e5e5 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -164,22 +164,9 @@ struct CallbackOpInterface return false; }

-    bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return true; - } - - bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { - return false; - } - - bufferization::AliasingValueList - getAliasingValues(Operation *op, OpOperand &opOperand, - const bufferization::AnalysisState &state) const - { + bufferization::AliasingOpOperandList + getAliasingOpOperands(Operation *op, Value value, + const bufferization::AnalysisState &state) const { return {}; }

From 9c93c2a6c1d3f1e3dd2cb1cf0437c2bdafa0cf2b Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 15:23:16 -0400 Subject: [PATCH 157/183] Implement bufferizesToAllocation for CallbackCallOp and CustomCallOp

--- .../Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index dc4ed0e5e5..4c44c318f8 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -76,6 +76,8 @@ struct PrintOpInterface struct CustomCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel<CustomCallOpInterface, CustomCallOp> { + bool bufferizesToAllocation(Operation *op, Value value) const { return true; } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { @@ -166,7 +168,8 @@ struct CallbackOpInterface bufferization::AliasingOpOperandList getAliasingOpOperands(Operation *op, Value value, - const bufferization::AnalysisState &state) const { + const bufferization::AnalysisState &state) const + { return {}; }

@@ -199,6 +202,8 @@ struct CallbackOpInterface struct CallbackCallOpInterface : public bufferization::BufferizableOpInterface::ExternalModel<CallbackCallOpInterface, CallbackCallOp> { + bool bufferizesToAllocation(Operation *op, Value value) const { return true; } + bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const {

From 323b09372db8dd7f4741166335fee975cc18396a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Mon, 7 Oct 2024 17:50:58 -0400 Subject: [PATCH 158/183] Tentatively revert CustomCallOp bufferizesToMemoryWrite to false

--- 
mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 4c44c318f8..4aadb29bfd 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -87,7 +87,7 @@ struct CustomCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return true; + return false; } bufferization::AliasingValueList From 009929bc54df7121acc793ba0e1dd6b9d7f9960c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 8 Oct 2024 12:07:20 -0400 Subject: [PATCH 159/183] Test disable lcurses --- frontend/catalyst/debug/compiler_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/catalyst/debug/compiler_functions.py b/frontend/catalyst/debug/compiler_functions.py index 09175346e8..30e30fae4b 100644 --- a/frontend/catalyst/debug/compiler_functions.py +++ b/frontend/catalyst/debug/compiler_functions.py @@ -271,7 +271,7 @@ def f(x): f"-Wl,-rpath,{python_lib_dir_path}", f"-L{python_lib_dir_path}", "-lpython" + version_str, - "-lcurses", + #"-lcurses", ] # Linker in macOS might use @rpath/Python3.framework/Versions/3.x/Python3. From e3274797d24fcede73c4e49dbf916c338d08fd0f Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 8 Oct 2024 13:22:23 -0400 Subject: [PATCH 160/183] Remove lcurses installation --- .github/workflows/scripts/linux_arm64/rh8/test_wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh b/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh index 817ea3a8a1..626d3d93ce 100644 --- a/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh +++ b/.github/workflows/scripts/linux_arm64/rh8/test_wheels.sh @@ -11,7 +11,7 @@ export PYTHON_PACKAGE=$4 # Install system dependencies (gcc gives access to c99, which is needed by some tests) dnf update -y -dnf install -y libzstd-devel gcc-toolset-${GCC_VERSION} gcc ncurses-devel +dnf install -y libzstd-devel gcc-toolset-${GCC_VERSION} gcc if [ "$PYTHON_MAJOR_MINOR" != "3.10" ]; then dnf install -y ${PYTHON_PACKAGE} ${PYTHON_PACKAGE}-devel else From aa9f69cda8c2a88a70a719473a92afdb2055955c Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 8 Oct 2024 13:27:05 -0400 Subject: [PATCH 161/183] Cleanup --- frontend/catalyst/debug/compiler_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/frontend/catalyst/debug/compiler_functions.py b/frontend/catalyst/debug/compiler_functions.py index 30e30fae4b..aa8cdb9564 100644 --- a/frontend/catalyst/debug/compiler_functions.py +++ b/frontend/catalyst/debug/compiler_functions.py @@ -271,7 +271,6 @@ def f(x): f"-Wl,-rpath,{python_lib_dir_path}", f"-L{python_lib_dir_path}", "-lpython" + version_str, - #"-lcurses", ] # Linker in macOS might use @rpath/Python3.framework/Versions/3.x/Python3. 
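For reference, the hooks that the surrounding patches keep adjusting (`bufferizesToMemoryRead`, `bufferizesToMemoryWrite`, `getAliasingValues`, `bufferizesToAllocation`, `bufferize`) all come from MLIR's `BufferizableOpInterface`. The sketch below shows a minimal external model for a hypothetical op `FooOp` with one tensor operand and one freshly allocated tensor result. This is illustrative only: `FooOp`, its copy-like lowering, and the statically shaped identity-layout buffer are assumptions made for the example, not part of Catalyst.

    #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
    #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
    #include "mlir/Dialect/MemRef/IR/MemRef.h"

    using namespace mlir;

    // Sketch only: `FooOp` is a placeholder op that reads one tensor operand
    // and produces one new tensor result.
    struct FooOpInterface
        : public bufferization::BufferizableOpInterface::ExternalModel<FooOpInterface, FooOp> {
        // The lowered op reads the operand's buffer.
        bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                                    const bufferization::AnalysisState &state) const
        {
            return true;
        }

        // The operand's buffer is never written in place; the result gets its
        // own allocation instead (see bufferizesToAllocation below).
        bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                                     const bufferization::AnalysisState &state) const
        {
            return false;
        }

        // No result aliases an operand buffer, so the alias set is empty.
        bufferization::AliasingValueList
        getAliasingValues(Operation *op, OpOperand &opOperand,
                          const bufferization::AnalysisState &state) const
        {
            return {};
        }

        // The result is materialized into a newly allocated buffer.
        bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

        LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                                const bufferization::BufferizationOptions &options) const
        {
            // Fetch the buffer behind the single tensor operand.
            FailureOr<Value> srcBuffer =
                bufferization::getBuffer(rewriter, op->getOperand(0), options);
            if (failed(srcBuffer))
                return failure();
            // Assumed copy-like lowering on a statically shaped, identity-layout
            // memref: allocate a destination buffer and copy the operand into it.
            auto memrefType = cast<MemRefType>(srcBuffer->getType());
            Value alloc = rewriter.create<memref::AllocOp>(op->getLoc(), memrefType);
            rewriter.create<memref::CopyOp>(op->getLoc(), *srcBuffer, alloc);
            // Replace the tensor result with the bufferized value.
            bufferization::replaceOpWithBufferizedValues(rewriter, op, alloc);
            return success();
        }
    };

Such a model would be attached the same way the existing ones are, e.g. `FooOp::attachInterface<FooOpInterface>(*ctx)` inside a dialect-extension callback on the `DialectRegistry`.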
From 87be1d34cabe415b6cc88e714cc5a383b5d9ba0a Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Tue, 8 Oct 2024 15:16:52 -0400 Subject: [PATCH 162/183] Add comment to CallbackOp's bufferizesToMemoryWrite --- mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 4aadb29bfd..7a2e81decc 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -213,6 +213,9 @@ struct CallbackCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { + // The method returns `true` if the given OpOperand bufferizes to a memory write. + // Since CallbackCallOp is related to print, which does not write to its operand, + // Maybe we can set this to `false`. return false; } From 143ba55e14277f82ed3918370cb76160a509e4a0 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 9 Oct 2024 11:48:25 -0400 Subject: [PATCH 163/183] Add table of bufferizable ops --- mlir/lib/Bufferization.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 mlir/lib/Bufferization.md diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md new file mode 100644 index 0000000000..b8addeccb5 --- /dev/null +++ b/mlir/lib/Bufferization.md @@ -0,0 +1,23 @@ +**Bufferization Interfaces:** + +| Bufferizable Operations | PrintOp | CustomCallOp | CallbackOp | CallbackCallOp | AdjointOp | BackpropOp | ForwardOp | ReverseOp | QubitUnitaryOp | HermitianOp | HamiltonianOp | SampleOp | StateOp | ProbsOp | CountsOp | SetStateOp | SetBasisStateOp | +| --------------------------------| ---------| ------------ | ------------ | -------------- | --------- | ---------- | --------- | --------- | -------------- | ----------- | ------------- | -------- | ------- | ------- | -------- | ---------- | --------------- | +| Catagory | | catalyst | catalyst | catalyst | catalyst | gradient | gradient | gradient | gradient | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | +| bufferizesToAllocation | | true | | true | | | | | | | | | | | | | | +| bufferizesToMemoryRead | true | true | | false | true | true | | | true | true | true | false | false | false | false | false | false | +| bufferizesToMemoryWrite | false | false | | false | false | true | | | false | false | false | false | false | false | false | false | false | +| bufferizesToElementwiseAccess | | | | | | | | | | | | | | | | | | +| resultBufferizesToMemoryWrite | | | | | | | | | | | | | | | | | | +| mustBufferizeInPlace | | | | | | | | | | | | | | | | | | +| getAliasingValues | {} | {} | | {} | {} | {} | | | {} | {} | {} | {} | {} | {} | {} | {} | {} | +| getAliasingOpOperands | | | {} | | | | v | v | | | | | | | | | | +| resolveConflicts | | | | | | | | | | | | | | | | | | +| bufferize | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | +| isWritable | | | | | | | | | | | | | | | | | | +| isNotConflicting | | | | | | | | | | | | | | | | | | +| verifyAnalysis | | | | | | | v | v | | | | | | | | | | +| getBufferType | | | | | | | v | v | | | | | | | | | | +| isRepetitiveRegion | | | | | | | | | | | | | | | | | | +| isParallelRegion | | | | | | | | | | | | | | | | | | +| hasTensorSemantics | | | v | | | | v | v | | | | | | | | | | +| supportsUnstructuredControlFlow | | 
| true | | | | true | true | | | | | | | | | | \ No newline at end of file From 8d0324db672d507d651b46fd65e9058455a9bb03 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 9 Oct 2024 12:04:20 -0400 Subject: [PATCH 164/183] Small fix --- mlir/lib/Bufferization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md index b8addeccb5..16b0cfd98a 100644 --- a/mlir/lib/Bufferization.md +++ b/mlir/lib/Bufferization.md @@ -2,7 +2,7 @@ | Bufferizable Operations | PrintOp | CustomCallOp | CallbackOp | CallbackCallOp | AdjointOp | BackpropOp | ForwardOp | ReverseOp | QubitUnitaryOp | HermitianOp | HamiltonianOp | SampleOp | StateOp | ProbsOp | CountsOp | SetStateOp | SetBasisStateOp | | --------------------------------| ---------| ------------ | ------------ | -------------- | --------- | ---------- | --------- | --------- | -------------- | ----------- | ------------- | -------- | ------- | ------- | -------- | ---------- | --------------- | -| Catagory | | catalyst | catalyst | catalyst | catalyst | gradient | gradient | gradient | gradient | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | +| Catagory | catalyst | catalyst | catalyst | catalyst | gradient | gradient | gradient | gradient | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | | bufferizesToAllocation | | true | | true | | | | | | | | | | | | | | | bufferizesToMemoryRead | true | true | | false | true | true | | | true | true | true | false | false | false | false | false | false | | bufferizesToMemoryWrite | false | false | | false | false | true | | | false | false | false | false | false | false | false | false | false | From 6dc98b95a641f847feeb0894a18ee1e629e52298 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Wed, 9 Oct 2024 12:07:30 -0400 Subject: [PATCH 165/183] Small correction --- mlir/lib/Bufferization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md index 16b0cfd98a..63a14ba49d 100644 --- a/mlir/lib/Bufferization.md +++ b/mlir/lib/Bufferization.md @@ -4,7 +4,7 @@ | --------------------------------| ---------| ------------ | ------------ | -------------- | --------- | ---------- | --------- | --------- | -------------- | ----------- | ------------- | -------- | ------- | ------- | -------- | ---------- | --------------- | | Catagory | catalyst | catalyst | catalyst | catalyst | gradient | gradient | gradient | gradient | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | quantum | | bufferizesToAllocation | | true | | true | | | | | | | | | | | | | | -| bufferizesToMemoryRead | true | true | | false | true | true | | | true | true | true | false | false | false | false | false | false | +| bufferizesToMemoryRead | true | true | | true | true | true | | | true | true | true | false | false | false | false | false | false | | bufferizesToMemoryWrite | false | false | | false | false | true | | | false | false | false | false | false | false | false | false | false | | bufferizesToElementwiseAccess | | | | | | | | | | | | | | | | | | | resultBufferizesToMemoryWrite | | | | | | | | | | | | | | | | | | From 44d4597e17e8bed146822f9116219ccdd3c9d1db Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 09:56:15 -0400 Subject: [PATCH 166/183] Set supportsUnstructuredControlFlow for CallBackOp as False --- mlir/lib/Bufferization.md | 2 +- 
mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md index 63a14ba49d..14eb74599b 100644 --- a/mlir/lib/Bufferization.md +++ b/mlir/lib/Bufferization.md @@ -20,4 +20,4 @@ | isRepetitiveRegion | | | | | | | | | | | | | | | | | | | isParallelRegion | | | | | | | | | | | | | | | | | | | hasTensorSemantics | | | v | | | | v | v | | | | | | | | | | -| supportsUnstructuredControlFlow | | | true | | | | true | true | | | | | | | | | | \ No newline at end of file +| supportsUnstructuredControlFlow | | | false | | | | true | true | | | | | | | | | | \ No newline at end of file diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 7a2e81decc..689b2a2a15 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -150,7 +150,7 @@ struct CustomCallOpInterface struct CallbackOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< CallbackOpInterface, CallbackOp> { - static bool supportsUnstructuredControlFlow() { return true; } + static bool supportsUnstructuredControlFlow() { return false; } bool hasTensorSemantics(Operation *op) const { From 3cecfaff38b289c114678bd1942ad300946dd0c1 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 11:47:23 -0400 Subject: [PATCH 167/183] Remove uncontrolflow from CallBackOp --- .../Transforms/BufferizableOpInterfaceImpl.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 689b2a2a15..4a71807c14 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -24,13 +24,7 @@ namespace { * (if bufferizing in-place). * * `getAliasingOpOperands`: Return the OpResults that may share the same buffer as the given - * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to - * serve the same purpose. - * - * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires - * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the - * implementation of `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and - * `getAliasingOpOperands`. + * OpOperand. 
* * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */ @@ -148,10 +142,8 @@ struct CustomCallOpInterface }; struct CallbackOpInterface - : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - CallbackOpInterface, CallbackOp> { - static bool supportsUnstructuredControlFlow() { return false; } - + : public bufferization::BufferizableOpInterface::ExternalModel { bool hasTensorSemantics(Operation *op) const { auto isaTensor = llvm::IsaPred; From 2cb30e391ee809d3cc33072eceb558e974c84439 Mon Sep 17 00:00:00 2001 From: erick-xanadu <110487834+erick-xanadu@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:55:38 -0400 Subject: [PATCH 168/183] Bufferization comments (#1197) --- frontend/catalyst/compiler.py | 102 ++++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 6 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 8b37a6205c..552134df6b 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -221,25 +221,112 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt ], ) +# From: https://mlir.llvm.org/docs/Bufferization/#overview +# +# Preprocessing +# | rewrite_in_destination_passing_style +# | -eliminate-empty-tensors +# Bufferization +# | -one-shot-bufferize +# Buffer-Level +# Optimizations +# | -buffer-hoisting +# | -buffer-loop-hoisting +# | -buffer-results-to-out-params +# | -drop-equivalent-buffer-results +# | -promote-buffers-to-stack +# Deallocation +# | -buffer-deallocation-pipeline + BUFFERIZATION_PASS = ( "BufferizationPass", [ "inline", "gradient-preprocess", - "eliminate-empty-tensors", "convert-elementwise-to-linalg", - "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops " - "function-boundary-type-conversion=identity-layout-map}", - "canonicalize", # Remove dead memrefToTensorOp's - "gradient-postprocess", + "canonicalize", +# Preprocessing: +# rewrite_in_destination_passing_style +# +# We are not rewriting everything in DPS before -one-shot-bufferize +# This was discussed with the main author of the -one-shot-bufferize +# pass and he stated the following: +# +# One-Shot Bufferize was designed for ops that are in DPS (destination-passing style). +# Ops that are not in DPS can still be bufferized, +# but a new buffer will be allocated for every tensor result. +# That’s functionally correct but inefficient. +# +# I’m not sure whether it’s better to first migrate to the new bufferization, +# then turn the ops into DPS ops, or do it the other way around. +# One benefit of implementing the bufferization first is that +# it’s a smaller step that you can already run end-to-end. +# And you can think of the DPS of a performance improvement on top of it. +# +# https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 +# +# Here, please note that gradient-preprocessing is different than rewriting in DPS. +# So, overall, we are skipping this section while we first focus on migrating to the +# new -one-shot-bufferize + "eliminate-empty-tensors", + ( + "one-shot-bufferize" + "{" + "bufferize-function-boundaries " + # - Bufferize function boundaries (experimental). + # + # By default, function boundaries are not bufferized. + # This is because there are currently limitations around function graph bufferization: + # recursive calls are not supported. 
+ # As long as there are no recursive calls, function boundary bufferization can be enabled with bufferize-function-boundaries. + # Each tensor function argument and tensor function result is then turned into a memref. + # The layout map of the memref type can be controlled with function-boundary-type-conversion. + # + # https://mlir.llvm.org/docs/Bufferization/#using-one-shot-bufferize + "allow-return-allocs-from-loops " + # - Allows returning/yielding new allocations from a loop. + # https://github.com/llvm/llvm-project/pull/83964 + # https://github.com/llvm/llvm-project/pull/87594 + "function-boundary-type-conversion=identity-layout-map" + # - Controls layout maps when bufferizing function signatures. + # You can control the memref types at the function boundary with + # function-boundary-type-conversion. E.g., if you set it to identity-layout-map, + # you should get the same type as with --func-bufferize. + # By default, we put a fully dynamic layout map strided<[?, ?], offset: ?> + # because that works best if you don't know what layout map the buffers at + # the call site have -- you can always cast a buffer to a type with + # fully dynamic layout map. (But not the other way around. That may require a reallocation.) + # + # https://discord.com/channels/636084430946959380/642426447167881246/1212338527824515102 + "}" + ), + # Remove dead memrefToTensorOp's # introduced during gradient-bufferize of callbacks + # TODO: Figure out how to remove this. + "gradient-postprocess", "func.func(buffer-hoisting)", "func.func(buffer-loop-hoisting)", + + # TODO: Figure out how to include the other buffer-level optimizations. + # -buffer-results-to-out-params, + # -drop-equivalent-buffer-results, + # -promote-buffers-to-stack + + # Deallocation + # The buffer deallocation pass has been deprecated in favor of the + # ownership-based buffer deallocation pipeline. + # The deprecated pass has some limitations that may cause memory leaks in the resulting IR. + # TODO: Switch to one-shot-bufferization once it is merged. "func.func(buffer-deallocation)", + # catalyst.list_* operations are not bufferized through + # the bufferization interface + # This is because they store a memref inside of a memref + # which is incompatible with the bufferization pipeline. "convert-arraylist-to-memref", "convert-bufferization-to-memref", - "canonicalize", # Must be after convert-bufferization-to-memref + # Must be after convert-bufferization-to-memref # otherwise there are issues in lowering of dynamic tensors. + "canonicalize", # "cse", "cp-global-memref", ], @@ -248,6 +335,9 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt BUFFERIZATION_ASYNC_PASS = ( "BufferizationPass", [ + # TODO: Can we remove copy-before-write? + # copy-before-write: + # Skip the analysis. Make a buffer copy on every write. 
s.replace("}", " copy-before-write}") if s.startswith("one-shot-bufferize") else s for s in BUFFERIZATION_PASS[1] ], From 812bf7f69bc576206eded493f5b55cd625d24b15 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 11:57:29 -0400 Subject: [PATCH 169/183] Entirely remove all unstructure control flow --- mlir/lib/Bufferization.md | 8 +-- .../BufferizableOpInterfaceImpl.cpp | 1 - .../BufferizableOpInterfaceImpl.cpp | 50 ++----------------- 3 files changed, 8 insertions(+), 51 deletions(-) diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md index 14eb74599b..2fefd566c3 100644 --- a/mlir/lib/Bufferization.md +++ b/mlir/lib/Bufferization.md @@ -10,14 +10,14 @@ | resultBufferizesToMemoryWrite | | | | | | | | | | | | | | | | | | | mustBufferizeInPlace | | | | | | | | | | | | | | | | | | | getAliasingValues | {} | {} | | {} | {} | {} | | | {} | {} | {} | {} | {} | {} | {} | {} | {} | -| getAliasingOpOperands | | | {} | | | | v | v | | | | | | | | | | +| getAliasingOpOperands | | | {} | | | | {} | {} | | | | | | | | | | | resolveConflicts | | | | | | | | | | | | | | | | | | | bufferize | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | | isWritable | | | | | | | | | | | | | | | | | | | isNotConflicting | | | | | | | | | | | | | | | | | | -| verifyAnalysis | | | | | | | v | v | | | | | | | | | | -| getBufferType | | | | | | | v | v | | | | | | | | | | +| verifyAnalysis | | | | | | | | | | | | | | | | | | +| getBufferType | | | | | | | | | | | | | | | | | | | isRepetitiveRegion | | | | | | | | | | | | | | | | | | | isParallelRegion | | | | | | | | | | | | | | | | | | | hasTensorSemantics | | | v | | | | v | v | | | | | | | | | | -| supportsUnstructuredControlFlow | | | false | | | | true | true | | | | | | | | | | \ No newline at end of file +| supportsUnstructuredControlFlow | | | | | | | | | | | | | | | | | | \ No newline at end of file diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 4a71807c14..8411b9bf78 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,6 +1,5 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index ce55763343..9bad418c0c 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,14 +1,10 @@ #include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" -#include "mlir/Conversion/LLVMCommon/Pattern.h" -#include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include 
"mlir/IR/BuiltinTypes.h" #include "mlir/Interfaces/FunctionInterfaces.h" @@ -17,7 +13,6 @@ #include "Gradient/IR/GradientOps.h" #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" #include "Gradient/Utils/GradientShape.h" -#include "Quantum/IR/QuantumOps.h" #include "llvm/ADT/STLExtras.h" using namespace mlir; @@ -37,11 +32,6 @@ namespace { * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to * serve the same purpose. * - * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires - * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the - * implementation of `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and - * `getAliasingOpOperands`. - * * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */ @@ -292,8 +282,7 @@ struct BackpropOpInterface }; struct ForwardOpInterface - : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - ForwardOpInterface, ForwardOp> { + : public bufferization::BufferizableOpInterface::ExternalModel { static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const @@ -316,22 +305,7 @@ struct ForwardOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return getAliasingBranchOpOperands(op, cast(value), state); - } - - FailureOr getBufferType(Operation *op, Value value, - const bufferization::BufferizationOptions &options, - SmallVector &invocationStack) const - { - auto forwardOp = cast(op); - auto bbArg = cast(value); - - // Function arguments are special. - if (bbArg.getOwner() == &forwardOp.getBody().front()) - return getBufferizedFunctionArgType(forwardOp, bbArg.getArgNumber(), options); - - return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( - op, value, options, invocationStack); + return {}; } LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const @@ -413,8 +387,7 @@ struct ForwardOpInterface }; struct ReverseOpInterface - : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< - ReverseOpInterface, ReverseOp> { + : public bufferization::BufferizableOpInterface::ExternalModel { static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const @@ -437,22 +410,7 @@ struct ReverseOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return getAliasingBranchOpOperands(op, cast(value), state); - } - - FailureOr getBufferType(Operation *op, Value value, - const bufferization::BufferizationOptions &options, - SmallVector &invocationStack) const - { - auto reverseOp = cast(op); - auto bbArg = cast(value); - - // Function arguments are special. 
- if (bbArg.getOwner() == &reverseOp.getBody().front()) - return getBufferizedFunctionArgType(reverseOp, bbArg.getArgNumber(), options); - - return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( - op, value, options, invocationStack); + return {}; } LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const From c76698a8a51e0034308dc7a045cb652772c57bc1 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 11:58:41 -0400 Subject: [PATCH 170/183] Reformatting --- frontend/catalyst/compiler.py | 106 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 552134df6b..20828f4c05 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -245,60 +245,60 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-preprocess", "convert-elementwise-to-linalg", "canonicalize", -# Preprocessing: -# rewrite_in_destination_passing_style -# -# We are not rewriting everything in DPS before -one-shot-bufferize -# This was discussed with the main author of the -one-shot-bufferize -# pass and he stated the following: -# -# One-Shot Bufferize was designed for ops that are in DPS (destination-passing style). -# Ops that are not in DPS can still be bufferized, -# but a new buffer will be allocated for every tensor result. -# That’s functionally correct but inefficient. -# -# I’m not sure whether it’s better to first migrate to the new bufferization, -# then turn the ops into DPS ops, or do it the other way around. -# One benefit of implementing the bufferization first is that -# it’s a smaller step that you can already run end-to-end. -# And you can think of the DPS of a performance improvement on top of it. -# -# https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 -# -# Here, please note that gradient-preprocessing is different than rewriting in DPS. -# So, overall, we are skipping this section while we first focus on migrating to the -# new -one-shot-bufferize + # Preprocessing: + # rewrite_in_destination_passing_style + # + # We are not rewriting everything in DPS before -one-shot-bufferize + # This was discussed with the main author of the -one-shot-bufferize + # pass and he stated the following: + # + # One-Shot Bufferize was designed for ops that are in DPS (destination-passing style). + # Ops that are not in DPS can still be bufferized, + # but a new buffer will be allocated for every tensor result. + # That’s functionally correct but inefficient. + # + # I’m not sure whether it’s better to first migrate to the new bufferization, + # then turn the ops into DPS ops, or do it the other way around. + # One benefit of implementing the bufferization first is that + # it’s a smaller step that you can already run end-to-end. + # And you can think of the DPS of a performance improvement on top of it. + # + # https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 + # + # Here, please note that gradient-preprocessing is different than rewriting in DPS. + # So, overall, we are skipping this section while we first focus on migrating to the + # new -one-shot-bufferize "eliminate-empty-tensors", ( - "one-shot-bufferize" - "{" - "bufferize-function-boundaries " - # - Bufferize function boundaries (experimental). - # - # By default, function boundaries are not bufferized. 
- # This is because there are currently limitations around function graph bufferization: - # recursive calls are not supported. - # As long as there are no recursive calls, function boundary bufferization can be enabled with bufferize-function-boundaries. - # Each tensor function argument and tensor function result is then turned into a memref. - # The layout map of the memref type can be controlled with function-boundary-type-conversion. - # - # https://mlir.llvm.org/docs/Bufferization/#using-one-shot-bufferize - "allow-return-allocs-from-loops " - # - Allows returning/yielding new allocations from a loop. - # https://github.com/llvm/llvm-project/pull/83964 - # https://github.com/llvm/llvm-project/pull/87594 - "function-boundary-type-conversion=identity-layout-map" - # - Controls layout maps when bufferizing function signatures. - # You can control the memref types at the function boundary with - # function-boundary-type-conversion. E.g., if you set it to identity-layout-map, - # you should get the same type as with --func-bufferize. - # By default, we put a fully dynamic layout map strided<[?, ?], offset: ?> - # because that works best if you don't know what layout map the buffers at - # the call site have -- you can always cast a buffer to a type with - # fully dynamic layout map. (But not the other way around. That may require a reallocation.) - # - # https://discord.com/channels/636084430946959380/642426447167881246/1212338527824515102 - "}" + "one-shot-bufferize" + "{" + "bufferize-function-boundaries " + # - Bufferize function boundaries (experimental). + # + # By default, function boundaries are not bufferized. + # This is because there are currently limitations around function graph bufferization: + # recursive calls are not supported. + # As long as there are no recursive calls, function boundary bufferization can be enabled with bufferize-function-boundaries. + # Each tensor function argument and tensor function result is then turned into a memref. + # The layout map of the memref type can be controlled with function-boundary-type-conversion. + # + # https://mlir.llvm.org/docs/Bufferization/#using-one-shot-bufferize + "allow-return-allocs-from-loops " + # - Allows returning/yielding new allocations from a loop. + # https://github.com/llvm/llvm-project/pull/83964 + # https://github.com/llvm/llvm-project/pull/87594 + "function-boundary-type-conversion=identity-layout-map" + # - Controls layout maps when bufferizing function signatures. + # You can control the memref types at the function boundary with + # function-boundary-type-conversion. E.g., if you set it to identity-layout-map, + # you should get the same type as with --func-bufferize. + # By default, we put a fully dynamic layout map strided<[?, ?], offset: ?> + # because that works best if you don't know what layout map the buffers at + # the call site have -- you can always cast a buffer to a type with + # fully dynamic layout map. (But not the other way around. That may require a reallocation.) + # + # https://discord.com/channels/636084430946959380/642426447167881246/1212338527824515102 + "}" ), # Remove dead memrefToTensorOp's # introduced during gradient-bufferize of callbacks @@ -306,12 +306,10 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt "gradient-postprocess", "func.func(buffer-hoisting)", "func.func(buffer-loop-hoisting)", - # TODO: Figure out how to include the other buffer-level optimizations. 
# -buffer-results-to-out-params, # -drop-equivalent-buffer-results, # -promote-buffers-to-stack - # Deallocation # The buffer deallocation pass has been deprecated in favor of the # ownership-based buffer deallocation pipeline. From 5b488b2eb484153d4ba3d7dc2e209b15c2278d15 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 13:38:49 -0400 Subject: [PATCH 171/183] Revert "Entirely remove all unstructure control flow" This reverts commit 812bf7f69bc576206eded493f5b55cd625d24b15. --- mlir/lib/Bufferization.md | 8 +-- .../BufferizableOpInterfaceImpl.cpp | 1 + .../BufferizableOpInterfaceImpl.cpp | 50 +++++++++++++++++-- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Bufferization.md b/mlir/lib/Bufferization.md index 2fefd566c3..14eb74599b 100644 --- a/mlir/lib/Bufferization.md +++ b/mlir/lib/Bufferization.md @@ -10,14 +10,14 @@ | resultBufferizesToMemoryWrite | | | | | | | | | | | | | | | | | | | mustBufferizeInPlace | | | | | | | | | | | | | | | | | | | getAliasingValues | {} | {} | | {} | {} | {} | | | {} | {} | {} | {} | {} | {} | {} | {} | {} | -| getAliasingOpOperands | | | {} | | | | {} | {} | | | | | | | | | | +| getAliasingOpOperands | | | {} | | | | v | v | | | | | | | | | | | resolveConflicts | | | | | | | | | | | | | | | | | | | bufferize | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | v | | isWritable | | | | | | | | | | | | | | | | | | | isNotConflicting | | | | | | | | | | | | | | | | | | -| verifyAnalysis | | | | | | | | | | | | | | | | | | -| getBufferType | | | | | | | | | | | | | | | | | | +| verifyAnalysis | | | | | | | v | v | | | | | | | | | | +| getBufferType | | | | | | | v | v | | | | | | | | | | | isRepetitiveRegion | | | | | | | | | | | | | | | | | | | isParallelRegion | | | | | | | | | | | | | | | | | | | hasTensorSemantics | | | v | | | | v | v | | | | | | | | | | -| supportsUnstructuredControlFlow | | | | | | | | | | | | | | | | | | \ No newline at end of file +| supportsUnstructuredControlFlow | | | false | | | | true | true | | | | | | | | | | \ No newline at end of file diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 8411b9bf78..4a71807c14 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,5 +1,6 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 9bad418c0c..ce55763343 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,10 +1,14 @@ #include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include 
"mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Interfaces/FunctionInterfaces.h" @@ -13,6 +17,7 @@ #include "Gradient/IR/GradientOps.h" #include "Gradient/Transforms/BufferizableOpInterfaceImpl.h" #include "Gradient/Utils/GradientShape.h" +#include "Quantum/IR/QuantumOps.h" #include "llvm/ADT/STLExtras.h" using namespace mlir; @@ -32,6 +37,11 @@ namespace { * OpOperand. Note that MLIR documentation does not mention `getAliasingValues` but it seems to * serve the same purpose. * + * Bufferizing FunctionOpInterface is also not documented by MLIR. It requires + * `OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel`, which requires the + * implementation of `supportsUnstructuredControlFlow`, `hasTensorSemantics`, and + * `getAliasingOpOperands`. + * * Link: https://mlir.llvm.org/docs/Bufferization/#extending-one-shot-bufferize */ @@ -282,7 +292,8 @@ struct BackpropOpInterface }; struct ForwardOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< + ForwardOpInterface, ForwardOp> { static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const @@ -305,7 +316,22 @@ struct ForwardOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return {}; + return getAliasingBranchOpOperands(op, cast(value), state); + } + + FailureOr getBufferType(Operation *op, Value value, + const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const + { + auto forwardOp = cast(op); + auto bbArg = cast(value); + + // Function arguments are special. + if (bbArg.getOwner() == &forwardOp.getBody().front()) + return getBufferizedFunctionArgType(forwardOp, bbArg.getArgNumber(), options); + + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( + op, value, options, invocationStack); } LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const @@ -387,7 +413,8 @@ struct ForwardOpInterface }; struct ReverseOpInterface - : public bufferization::BufferizableOpInterface::ExternalModel { + : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< + ReverseOpInterface, ReverseOp> { static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const @@ -410,7 +437,22 @@ struct ReverseOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return {}; + return getAliasingBranchOpOperands(op, cast(value), state); + } + + FailureOr getBufferType(Operation *op, Value value, + const bufferization::BufferizationOptions &options, + SmallVector &invocationStack) const + { + auto reverseOp = cast(op); + auto bbArg = cast(value); + + // Function arguments are special. 
+ if (bbArg.getOwner() == &reverseOp.getBody().front()) + return getBufferizedFunctionArgType(reverseOp, bbArg.getArgNumber(), options); + + return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel::getBufferType( + op, value, options, invocationStack); } LogicalResult verifyAnalysis(Operation *op, const bufferization::AnalysisState &state) const From 3646328868f4f23615aefdc09610711147e62008 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Thu, 10 Oct 2024 14:00:54 -0400 Subject: [PATCH 172/183] set supportsUnstructuredControlFlow for forwardOp and reverseOp to false --- .../Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index ce55763343..dc130a295f 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -294,7 +294,7 @@ struct BackpropOpInterface struct ForwardOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< ForwardOpInterface, ForwardOp> { - static bool supportsUnstructuredControlFlow() { return true; } + static bool supportsUnstructuredControlFlow() { return false; } bool hasTensorSemantics(Operation *op) const { @@ -316,7 +316,7 @@ struct ForwardOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return getAliasingBranchOpOperands(op, cast(value), state); + return {}; } FailureOr getBufferType(Operation *op, Value value, @@ -415,7 +415,7 @@ struct ForwardOpInterface struct ReverseOpInterface : public bufferization::OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel< ReverseOpInterface, ReverseOp> { - static bool supportsUnstructuredControlFlow() { return true; } + static bool supportsUnstructuredControlFlow() { return false; } bool hasTensorSemantics(Operation *op) const { @@ -437,7 +437,7 @@ struct ReverseOpInterface getAliasingOpOperands(Operation *op, Value value, const bufferization::AnalysisState &state) const { - return getAliasingBranchOpOperands(op, cast(value), state); + return {}; } FailureOr getBufferType(Operation *op, Value value, From 938df918e4cec86e3e71fd269f2868eb84b22e38 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 11 Oct 2024 16:20:29 -0400 Subject: [PATCH 173/183] Update mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp Co-authored-by: erick-xanadu <110487834+erick-xanadu@users.noreply.github.com> --- mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index fe11fcc64a..76349ec74d 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -321,7 +321,7 @@ struct SetStateOpInterface bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, From 6c7791635180baef38cde7b138bcb8f6612d6273 Mon Sep 17 00:00:00 2001 From: Tzung-Han Juang Date: Fri, 11 Oct 2024 16:20:38 -0400 Subject: [PATCH 174/183] Update mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp Co-authored-by: 
erick-xanadu <110487834+erick-xanadu@users.noreply.github.com> --- mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp index 76349ec74d..2e61dfe579 100644 --- a/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Quantum/Transforms/BufferizableOpInterfaceImpl.cpp @@ -362,7 +362,7 @@ struct SetBasisStateOpInterface bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, From d1c0bde78fbbbeb9762dd750eb183f4ccb488a8c Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Thu, 10 Oct 2024 19:10:48 -0400 Subject: [PATCH 175/183] wip --- frontend/catalyst/compiler.py | 49 +++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 20828f4c05..14b9e2a6eb 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -268,8 +268,57 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt # Here, please note that gradient-preprocessing is different than rewriting in DPS. # So, overall, we are skipping this section while we first focus on migrating to the # new -one-shot-bufferize + "eliminate-empty-tensors", + ( + # Before we enter one-shot-bufferize, here is what we expect: + # * Given + # + # One-Shot Bufferize was designed for ops that are in DPS (destination-passing style). + # Ops that are not in DPS can still be bufferized, + # but a new buffer will be allocated for every tensor result. + # That’s functionally correct but inefficient. + # + # from: https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 + # we expect that results will be (automatically?) converted into new buffers. And it is + # up to us to just define the bufferization for the operands. + # + # So what is the state of the catalyst, gradient, quantum dialects at this point? + # + # Let's start with quantum: + # + # |-------------------------|--------------------| + # | operation | has result tensor | + # |-------------------------|--------------------| + # | quantum.set_state | | + # | quantum.set_basis_state | | + # | quantum.unitary | | + # | quantum.hermitian | | + # | quantum.hamiltonian | | + # | quantum.sample_op | YES | + # | quantum.counts_op | YES | + # | quantum.probs_op | YES | + # | quantum.state_op | YES | + # |-------------------------|--------------------| + # | catalyst.print_op | | + # | catalyst.custom_call | YES | + # | catalyst.callback | | + # | catalyst.callback_call | YES | + # | catalyst.launch_kernel | YES | + # |-------------------------|--------------------| + # | gradient.grad | YES | + # | gradient.value_and_grad | YES | + # | gradient.adjoint | YES | + # | gradient.backprop | YES | + # | gradient.jvp | YES | + # | gradient.vjp | YES | + # | gradient.forward | YES | + # | gradient.reverse | YES | + # |-------------------------|--------------------| + # + # So what this means is that, for the operations marked YES above, + # no operands are written to. They are only read.
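+ # + # (Editorial aside, schematic only and not pipeline input: in upstream MLIR, a DPS op such as + # %r = linalg.matmul ins(%a, %b : tensor<4x4xf32>, tensor<4x4xf32>) outs(%init : tensor<4x4xf32>) -> tensor<4x4xf32> + # can bufferize its result into the buffer of %init, whereas the non-DPS ops marked YES above + # each get a fresh allocation for every tensor result.)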
"one-shot-bufferize" "{" "bufferize-function-boundaries " From 12bf6186cb99a0bd5d68c74d38c524ac7d38d7b3 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Fri, 11 Oct 2024 14:45:11 -0400 Subject: [PATCH 176/183] Comments --- .../BufferizableOpInterfaceImpl.cpp | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp index 4a71807c14..c9033716d5 100644 --- a/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Catalyst/Transforms/BufferizableOpInterfaceImpl.cpp @@ -31,6 +31,7 @@ namespace { /// Bufferization of catalyst.print. Get memref of printOp.val. struct PrintOpInterface + // PrintOp will never write to the buffers : public bufferization::BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const @@ -68,8 +69,16 @@ struct PrintOpInterface /// Bufferization of catalyst.custom_call. Mainly get buffers for arguments. struct CustomCallOpInterface + // CustomCallOp will interface with BLAS functions. + // This operations is not in DPS form. This means that + // if we can guarantee operands are never written to, then we can set + // bufferizesToMemoryWrite as false. + // Results will be allocated a new buffer. + // TODO: Double check BLAS and others. Until then, it should be safe to keep + // bufferizesToMemoryWrite as True. : public bufferization::BufferizableOpInterface::ExternalModel { + bool bufferizesToAllocation(Operation *op, Value value) const { return true; } bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, @@ -81,7 +90,7 @@ struct CustomCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return false; + return true; } bufferization::AliasingValueList @@ -205,9 +214,12 @@ struct CallbackCallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - // The method returns `true` if the given OpOperand bufferizes to a memory write. - // Since CallbackCallOp is related to print, which does not write to its operand, - // Maybe we can set this to `false`. + // We can safely say false because CallbackCallOp's memrefs + // will be put in a JAX array and JAX arrays are immutable. + // + // Unlike NumPy arrays, JAX arrays are always immutable. + // + // https://jax.readthedocs.io/en/latest/notebooks/thinking_in_jax.html return false; } From 3c0bb0a2a4b89c38af951990b1c5b9397f8546e0 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Fri, 11 Oct 2024 15:08:54 -0400 Subject: [PATCH 177/183] f --- .../BufferizableOpInterfaceImpl.cpp | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index dc130a295f..6fb383c7c9 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -83,6 +83,16 @@ Value generateAllocation(OpBuilder &builder, Location loc, Value reference) { auto origMemrefType = cast(reference.getType()); // Rebuild MemRefType without memory layout. + // TODO: Investigate + // + // Something looks odd here. + // The result of a `memref.alloc` should be a memref without identity layout. 
+ // I know that the op supports operands for dims/symbols in the memref type, + // but I never understood why. + // Imo, a `memref.alloc() : memref` should have been generated. + // The result value can then be casted to `memref>`. + // + // https://discord.com/channels/636084430946959380/642426447167881246/1281710682160627785 auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); // Get dynamic dimension sizes from the provided reference value if necessary. SmallVector<Value> dynamicDims; @@ -112,6 +122,7 @@ void generateAllocations(RewriterBase &rewriter, Location loc, SmallVectorImpl { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const @@ -169,6 +180,13 @@ struct AdjointOpInterface }; struct BackpropOpInterface + // This operation is not in DPS style, + // but it has a lot of parameters, notably: + // Variadic: $args + // Variadic<...RankedTensorOf<[AnyFloat]>>: $cotangents + // I think we don't write to the cotangents, and also not to the arguments, + // so we can set bufferizesToMemoryWrite to false. + // The safe assumption is that it should be true. : public bufferization::BufferizableOpInterface::ExternalModel<BackpropOpInterface, BackpropOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, @@ -180,7 +198,7 @@ struct BackpropOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const bufferization::AnalysisState &state) const { - return true; + return false; } bufferization::AliasingValueList @@ -543,4 +561,4 @@ void catalyst::gradient::registerBufferizableOpInterfaceExternalModels(DialectRe ForwardOp::attachInterface<ForwardOpInterface>(*ctx); ReverseOp::attachInterface<ReverseOpInterface>(*ctx); }); -} \ No newline at end of file +} From 619bf3d6875fa03a1a3062e1c86936f72cee0054 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 12:59:48 -0400 Subject: [PATCH 178/183] More comments --- .../BufferizableOpInterfaceImpl.cpp | 59 +++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 6fb383c7c9..0cc45a056c 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -82,8 +82,22 @@ static ReturnOp getAssumedUniqueReturnOp(FunctionOpInterface funcOp) Value generateAllocation(OpBuilder &builder, Location loc, Value reference) { auto origMemrefType = cast<MemRefType>(reference.getType()); - // Rebuild MemRefType without memory layout. - // TODO: Investigate + // TODO: Investigate how to get rid of identity-layout-map + // + // Hi all. For one-shot-bufferization, is there any automatic way to pass all memref symbols + // to AllocOp? we have an example below that triggers error: 'memref.alloc' op symbol + // operand count does not equal memref symbol count: expected 1, got 0 . We think we have + // to pass the offset symbol to AllocOp. + // + // %0 = "bufferization.to_memref"(%arg0) : (tensor) -> memref> %1 = "memref.alloc"() <{operandSegmentSizes = array}> : () -> + // memref> + // + // We know we can set function-signature-type-conversion=identity-layout-map to get rid of + // it. But according to the document, identity-layout-map could be less efficient, we still + // want to stick with the default setting. + // + // https://discord.com/channels/636084430946959380/642426447167881246/1281620504859512914 // // Something looks odd here.
// The result of a `memref.alloc` should be a memref without identity layout. @@ -93,8 +107,42 @@ Value generateAllocation(OpBuilder &builder, Location loc, Value reference) // The result value can then be casted to `memref>`. // // https://discord.com/channels/636084430946959380/642426447167881246/1281710682160627785 - auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); - // Get dynamic dimension sizes from the provided reference value if necessary. + // + // What I find interesting is that the comment says that + // + // "we know we can set function-signature-type-conversion=identity-layout-map to get rid of + // it" + // + // and that is what we are using, however we still have this rebuilding a memref without the + // layout. If that were true, then we could uncomment the following line and it should work. + auto memrefType = origMemrefType; + // I can confirm that having + // function-signature-type-conversion=identity-layout-map makes the line above succeed while the + // line below fails: + // + // Get dynamic dimension sizes from the provided reference value if necessary. + // auto memrefType = MemRefType::get(origMemrefType.getShape(), + // origMemrefType.getElementType()); + // + // Looking at this a little bit deeper, I can say that the variable reference + // appears to come from a function parameter. + // And since it is not the identity layout, we see the following generic MLIR when not + // using identity layout + // + // "func.func"() <{function_type = (memref>) -> memref> + // + // and we see this when using the identity layout: + // + // func.func public @jit_fn(%arg0: memref) -> memref + // + // When not using identity layout but also not removing the layout in the alloca, there are + // errors in some cases but not in others. I believe we have to do some casts in other places as + // well, whenever we use allocas and the types come from the arguments. + // + // My recommendation: at some point it would be good to remove the identity-layout-map from the + // frontend but until we have some more resources, let's keep it along with the origMemrefType. SmallVector<Value> dynamicDims; if (!memrefType.hasStaticShape()) { for (int64_t dim = 0; dim < memrefType.getRank(); dim++) { @@ -106,6 +154,9 @@ Value generateAllocation(OpBuilder &builder, Location loc, Value reference) } return builder.create<memref::AllocOp>(loc, memrefType, dynamicDims); + // Uncomment below to follow Matthias' suggestion of placing a CastOp after AllocOp; + // some more tests will pass. + // return builder.create<memref::CastOp>(loc, origMemrefType, alloc_uncasted); } /// Helper function to generate a set of memref allocations. From b49881e8de89cc02c4eccf8bf9163d5b26ebfba9 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 14:07:20 -0400 Subject: [PATCH 179/183] Fix comments --- .../Transforms/PostprocessingPatterns.cpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index cfef5c9aae..21af67294d 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -210,21 +210,26 @@ struct RestoreReverseOp : public OpRewritePattern<ReverseOp> { * This pass aims to revert the changed made by One-shot bufferize.
* * For example, - * ``` - * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref attributes {argc = - * 2 : i64, implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape - * = 0 : i64} { %0 = func.call @bwd(%arg0) : (memref) -> memref %alloc = - * memref.alloc() {alignment = 64 : i64} : memref memref.copy %0, %alloc : memref to - * memref gradient.return {empty = true} %alloc : memref + * + * ```mlir + * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref + * attributes {argc = 2 : i64, implementation = @bwd, resc = 1 : i64, tape = 0 : i64} { + * %0 = func.call @bwd(%arg0) : (memref) -> memref + * %alloc = memref.alloc() {alignment = 64 : i64} : memref + * memref.copy %0, %alloc : memref to memref + * gradient.return {empty = true} %alloc : memref * } * ``` + * * will be turned into - * * ``` - * gradient.reverse @bwd.rev(%arg0: memref) -> memref attributes {argc = 2 : i64, - * implementation = @bwd, llvm.linkage = #llvm.linkage, resc = 1 : i64, tape = 0 : - * i64} { %0 = func.call @bwd(%arg0) : (memref) -> memref %alloc = memref.alloc() - * {alignment = 64 : i64} : memref memref.copy %0, %alloc : memref to memref - * gradient.return {empty = true} %alloc : memref + * + * ```mlir + * gradient.reverse @bwd.rev(%arg0: memref) -> memref + * attributes {argc = 2 : i64, implementation = @bwd, resc = 1 : i64, tape = 0 : i64} { + * %0 = func.call @bwd(%arg0) : (memref) -> memref + * %alloc = memref.alloc() {alignment = 64 : i64} : memref + * memref.copy %0, %alloc : memref to memref + * gradient.return {empty = true} %alloc : memref * } * ``` * @@ -301,4 +306,4 @@ void populatePostprocessingPatterns(RewritePatternSet &patterns) } } // namespace gradient -} // namespace catalyst \ No newline at end of file +} // namespace catalyst From 9bda879ab4568551aa666d50d873c328164274ba Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 15:08:16 -0400 Subject: [PATCH 180/183] Remove unnecessary code --- .../Transforms/PostprocessingPatterns.cpp | 89 ------------------- 1 file changed, 89 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp index 21af67294d..16c0243352 100644 --- a/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp +++ b/mlir/lib/Gradient/Transforms/PostprocessingPatterns.cpp @@ -205,94 +205,6 @@ struct PostprocessReverseOp : public OpRewritePattern { } }; -struct RestoreReverseOp : public OpRewritePattern { - /* One-shot bufferize optimizes away the return values that are not used. - * This pass aims to revert the changed made by One-shot bufferize. - * - * For example, - * - * ```mlir - * gradient.reverse @bwd.rev(%arg0: memref) -> memref, memref - * attributes {argc = 2 : i64, implementation = @bwd, resc = 1 : i64, tape = 0 : i64} { - * %0 = func.call @bwd(%arg0) : (memref) -> memref - * %alloc = memref.alloc() {alignment = 64 : i64} : memref - * memref.copy %0, %alloc : memref to memref - * gradient.return {empty = true} %alloc : memref - * } - * ``` - * - * will be turned into - * - * ```mlir - * gradient.reverse @bwd.rev(%arg0: memref) -> memref - * attributes {argc = 2 : i64, implementation = @bwd, resc = 1 : i64, tape = 0 : i64} { - * %0 = func.call @bwd(%arg0) : (memref) -> memref - * %alloc = memref.alloc() {alignment = 64 : i64} : memref - * memref.copy %0, %alloc : memref to memref - * gradient.return {empty = true} %alloc : memref - * } - * ``` - * - * However, Enzyme expects to see the removed return. We have to add it back. 
- */ - using OpRewritePattern::OpRewritePattern; - - mlir::LogicalResult matchAndRewrite(ReverseOp op, - mlir::PatternRewriter &rewriter) const override - { - auto forwardArgc = op.getArgc(); - auto forwardResc = op.getResc(); - auto tape = op.getTape(); - - // Check if the Op is post-processed. - if (op.getFunctionType().getNumInputs() == (forwardResc + forwardArgc) * 2 + tape) - return failure(); - - // If function signature is modified, this pass cannot be processed. - if (op.getFunctionType().getNumResults() >= forwardArgc) - return failure(); - - // get parenet module - auto module = op->getParentOfType(); - - // Get GradOp - CustomGradOp gradCaller = nullptr; - for (auto gradOp : module.getOps()) { - if (gradOp.getReverse() == op.getSymName()) { - gradCaller = gradOp; - } - } - - if (!gradCaller) - return failure(); - - ForwardOp target = nullptr; - // get corresponding FowardOp - for (auto forwardOp : module.getOps()) { - if (forwardOp.getSymName() == gradCaller.getForward()) { - target = forwardOp; - } - } - - if (!target) - return failure(); - - auto forwardArgTys = target.getArgumentTypes(); - SmallVector noTapeTys; - for (size_t i = 0; i < forwardArgTys.size(); ++i) { - if (i < op.getArgc()) { - noTapeTys.push_back(forwardArgTys[i]); - } - } - - auto reverseTy = rewriter.getFunctionType(op.getArgumentTypes(), noTapeTys); - - rewriter.modifyOpInPlace(op, [&] { op.setFunctionType(reverseTy); }); - - return failure(); - } -}; - } // namespace namespace catalyst { @@ -300,7 +212,6 @@ namespace gradient { void populatePostprocessingPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); patterns.add(patterns.getContext()); patterns.add(patterns.getContext()); } From 227c08e9b9886e6cf6e8101213cbefedd68322c1 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 15:10:28 -0400 Subject: [PATCH 181/183] style --- frontend/catalyst/compiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index 14b9e2a6eb..aed9cd131f 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -268,9 +268,7 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt # Here, please note that gradient-preprocessing is different than rewriting in DPS. # So, overall, we are skipping this section while we first focus on migrating to the # new -one-shot-bufferize - "eliminate-empty-tensors", - ( # Before we enter one-shot-bufferize, here is what we expect: # * Given From b179e70357368fbdb1914597c720e1305c969e5d Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 15:30:26 -0400 Subject: [PATCH 182/183] Fix --- mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp index 0cc45a056c..99b1b4396b 100644 --- a/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Gradient/Transforms/BufferizableOpInterfaceImpl.cpp @@ -115,14 +115,13 @@ Value generateAllocation(OpBuilder &builder, Location loc, Value reference) // // and that is what we are using, however we still have this rebuilding a memref without the // layout. If that were true, then we could uncomment the following line and it should work. 
- auto memrefType = origMemrefType; + // auto memrefType = origMemrefType; // I can confirm that having // function-signature-type-conversion=identity-layout-map makes the line above succeed while the // line below fails: // // Get dynamic dimension sizes from the provided reference value if necessary. - // auto memrefType = MemRefType::get(origMemrefType.getShape(), - // origMemrefType.getElementType()); + auto memrefType = MemRefType::get(origMemrefType.getShape(), origMemrefType.getElementType()); // // Looking at this a little bit deeper, I can say that the variable reference // appears to come from a function parameter. From e4235d756e337315eb83a89e039a1214b87ae2b5 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 15 Oct 2024 15:30:34 -0400 Subject: [PATCH 183/183] line length --- frontend/catalyst/compiler.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/frontend/catalyst/compiler.py b/frontend/catalyst/compiler.py index aed9cd131f..bd580b3950 100644 --- a/frontend/catalyst/compiler.py +++ b/frontend/catalyst/compiler.py @@ -273,14 +273,16 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt # Before we enter one-shot-bufferize, here is what we expect: # * Given # - # One-Shot Bufferize was designed for ops that are in DPS (destination-passing style). + # One-Shot Bufferize was designed for ops that are in DPS + # (destination-passing style). # Ops that are not in DPS can still be bufferized, # but a new buffer will be allocated for every tensor result. # That’s functionally correct but inefficient. # - # from: https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 - # we expect that results will be (automatically?) converted into new buffers. And it is - # up to us to just define the bufferization for the operands. + # https://discourse.llvm.org/t/steps-of-migrating-to-one-shot-bufferization/81062/2 + # + # we expect that results will be (automatically?) converted into new buffers. And it + # is up to us to just define the bufferization for the operands. # # So what is the state of the catalyst, gradient, quantum dialects at this point? # @@ -323,9 +325,11 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt # - Bufferize function boundaries (experimental). # # By default, function boundaries are not bufferized. - # This is because there are currently limitations around function graph bufferization: + # This is because there are currently limitations around function graph + # bufferization: # recursive calls are not supported. - # As long as there are no recursive calls, function boundary bufferization can be enabled with bufferize-function-boundaries. + # As long as there are no recursive calls, function boundary bufferization can be + # enabled with bufferize-function-boundaries. # Each tensor function argument and tensor function result is then turned into a memref. # The layout map of the memref type can be controlled with function-boundary-type-conversion. # @@ -342,7 +346,8 @@ def run_writing_command(command: List[str], compile_options: Optional[CompileOpt # By default, we put a fully dynamic layout map strided<[?, ?], offset: ?> # because that works best if you don't know what layout map the buffers at # the call site have -- you can always cast a buffer to a type with - # fully dynamic layout map. (But not the other way around. That may require a reallocation.) + # fully dynamic layout map. (But not the other way around.
That may require a + # reallocation.) # # https://discord.com/channels/636084430946959380/642426447167881246/1212338527824515102 "}"
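+ # (Note: the adjacent string literals above are concatenated by the Python parser, so this whole block reaches the pass pipeline as the single entry + # "one-shot-bufferize{bufferize-function-boundaries allow-return-allocs-from-loops function-boundary-type-conversion=identity-layout-map}".)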