diff --git a/.dep-versions b/.dep-versions
index 21e20ac2a1..edc108195f 100644
--- a/.dep-versions
+++ b/.dep-versions
@@ -1,7 +1,15 @@
 # Always update the version check in catalyst.__init__ when changing the JAX version.
+
+#############
+# We track mlir submodule versions from jax 0.4.32 for now
+# These are the earliest versions with complete upstream bufferization changes
+# Versions are retrieved from
+# python3 .github/workflows/set_dep_versions.py 0.4.32
+#############
+
 jax=0.6.0
-mhlo=89a891c986650c33df76885f5620e0a92150d90f
-llvm=3a8316216807d64a586b971f51695e23883331f7
+mhlo=25b008569f413d76cfa8f481f3a84e82b89c47f4
+llvm=5f74671c85877e03622e8d308aee15ed73ccee7c
 enzyme=v0.0.149

 # Always remove custom PL/LQ versions before release.
diff --git a/.github/workflows/build-wheel-linux-arm64.yaml b/.github/workflows/build-wheel-linux-arm64.yaml
index d2d0c0aae6..7b36859997 100644
--- a/.github/workflows/build-wheel-linux-arm64.yaml
+++ b/.github/workflows/build-wheel-linux-arm64.yaml
@@ -222,7 +222,7 @@ jobs:
             -DCMAKE_CXX_VISIBILITY_PRESET=default \
             -DCMAKE_CXX_FLAGS="-fuse-ld=lld"

-          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-19
+          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-20

       - name: Save Enzyme Build
         id: save-enzyme-build
diff --git a/.github/workflows/build-wheel-linux-x86_64.yaml b/.github/workflows/build-wheel-linux-x86_64.yaml
index 794d6ff021..2b8ce6a8c4 100644
--- a/.github/workflows/build-wheel-linux-x86_64.yaml
+++ b/.github/workflows/build-wheel-linux-x86_64.yaml
@@ -245,7 +245,7 @@ jobs:
             -DCMAKE_CXX_VISIBILITY_PRESET=default \
             -DCMAKE_CXX_FLAGS="-fuse-ld=lld"

-          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-19
+          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-20

       - name: Save Enzyme Build
         id: save-enzyme-build
diff --git a/.github/workflows/build-wheel-macos-arm64.yaml b/.github/workflows/build-wheel-macos-arm64.yaml
index c47fbdc719..0a1279423a 100644
--- a/.github/workflows/build-wheel-macos-arm64.yaml
+++ b/.github/workflows/build-wheel-macos-arm64.yaml
@@ -218,7 +218,7 @@ jobs:
            -DENZYME_STATIC_LIB=ON \
            -DCMAKE_CXX_VISIBILITY_PRESET=default

-          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-19
+          cmake --build $GITHUB_WORKSPACE/enzyme-build --target EnzymeStatic-20

       - name: Save Enzyme Build
         id: save-enzyme-build
diff --git a/doc/releases/changelog-dev.md b/doc/releases/changelog-dev.md
index 9917d27c47..c5364be715 100644
--- a/doc/releases/changelog-dev.md
+++ b/doc/releases/changelog-dev.md
@@ -213,6 +213,7 @@
   [(#1686)](https://github.com/PennyLaneAI/catalyst/pull/1686)
   [(#1708)](https://github.com/PennyLaneAI/catalyst/pull/1708)
   [(#1740)](https://github.com/PennyLaneAI/catalyst/pull/1740)
+  [(#1751)](https://github.com/PennyLaneAI/catalyst/pull/1751)

 * Redundant `OptionalAttr` is removed from `adjoint` argument in `QuantumOps.td` TableGen file
   [(#1746)](https://github.com/PennyLaneAI/catalyst/pull/1746)
diff --git a/frontend/catalyst/pipelines.py b/frontend/catalyst/pipelines.py
index b7ce1ddcc6..04f6c4178c 100644
--- a/frontend/catalyst/pipelines.py
+++ b/frontend/catalyst/pipelines.py
@@ -213,32 +213,33 @@ def get_quantum_compilation_stage(options: CompileOptions) -> List[str]:
     return list(filter(partial(is_not, None), quantum_compilation))


-def get_bufferization_stage(_options: CompileOptions) -> List[str]:
+def get_bufferization_stage(options: CompileOptions) -> List[str]:
     """Returns the list of passes that performs bufferization"""
+
+    bufferization_options = """bufferize-function-boundaries
+    allow-return-allocs-from-loops
+    function-boundary-type-conversion=identity-layout-map
+    unknown-type-conversion=identity-layout-map""".replace(
+        "\n", " "
+    )
+    if options.async_qnodes:
+        bufferization_options += " copy-before-write"
+
     bufferization = [
-        "one-shot-bufferize{dialect-filter=memref}",
         "inline",
-        "gradient-preprocess",
-        "one-shot-bufferize{dialect-filter=gradient unknown-type-conversion=identity-layout-map}",
-        "scf-bufferize",
         "convert-tensor-to-linalg",  # tensor.pad
-        "convert-elementwise-to-linalg",  # Must be run before --arith-bufferize
-        "arith-bufferize",
-        "empty-tensor-to-alloc-tensor",
-        "func.func(bufferization-bufferize)",
-        "func.func(tensor-bufferize)",
-        # Catalyst dialect's bufferization must be run before --func.func(linalg-bufferize)
-        "one-shot-bufferize{dialect-filter=catalyst unknown-type-conversion=identity-layout-map}",
-        "func.func(linalg-bufferize)",
-        "func.func(tensor-bufferize)",
-        "one-shot-bufferize{dialect-filter=quantum}",
-        "func-bufferize",
-        "func.func(finalizing-bufferize)",
+        "convert-elementwise-to-linalg",  # Must be run before --one-shot-bufferize
+        "gradient-preprocess",
+        "eliminate-empty-tensors",
+        ####################
+        "one-shot-bufferize{" + bufferization_options + "}",
+        ####################
         "canonicalize",  # Remove dead memrefToTensorOp's
         "gradient-postprocess",
         # introduced during gradient-bufferize of callbacks
         "func.func(buffer-hoisting)",
         "func.func(buffer-loop-hoisting)",
+        "func.func(promote-buffers-to-stack)",
         "func.func(buffer-deallocation)",
         "convert-arraylist-to-memref",
         "convert-bufferization-to-memref",
@@ -247,6 +248,7 @@ def get_bufferization_stage(_options: CompileOptions) -> List[str]:
         # "cse",
         "cp-global-memref",
     ]
+
     return bufferization


diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 870cc04def..1951f2c3a2 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -51,7 +51,6 @@ set(ALL_MHLO_PASSES
     HloToLinalgUtils
     MhloToLinalg
     MhloToStablehlo
-    MhloQuantToIntConversion
     StablehloToMhlo
 )

diff --git a/mlir/Makefile b/mlir/Makefile
index 9717282147..badd27d6f5 100644
--- a/mlir/Makefile
+++ b/mlir/Makefile
@@ -85,7 +85,10 @@ llvm:

 	# TODO: when updating LLVM, test to see if mlir/unittests/Bytecode/BytecodeTest.cpp:55 is passing
 	# and remove filter. This tests fails on CI/CD not locally.
-	LIT_FILTER_OUT="Bytecode|tosa-to-tensor" cmake --build $(LLVM_BUILD_DIR) --target $(LLVM_TARGETS)
+	# Note: the upstream lit test llvm-project/mlir/test/python/execution_engine.py requires
+	# the python package `ml_dtypes`. We don't actually use the execution engine, so we skip the
+	# test to reduce unnecessary dependencies.
+	LIT_FILTER_OUT="Bytecode|tosa-to-tensor|execution_engine" cmake --build $(LLVM_BUILD_DIR) --target $(LLVM_TARGETS)

 .PHONY: mhlo
 mhlo: TARGET_FILE := $(MK_DIR)/mlir-hlo/mhlo/transforms/CMakeLists.txt
@@ -130,7 +133,7 @@ enzyme:
 		-DCMAKE_CXX_VISIBILITY_PRESET=$(SYMBOL_VISIBILITY) \
 		-DCMAKE_POLICY_DEFAULT_CMP0116=NEW

-	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-19
+	cmake --build $(ENZYME_BUILD_DIR) --target EnzymeStatic-20

 .PHONY: plugin
 plugin:
diff --git a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
index 6225bc84b4..9a62a283e7 100644
--- a/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
+++ b/mlir/lib/Catalyst/Transforms/DetectQNodes.cpp
@@ -925,7 +925,7 @@ struct AddExceptionHandlingPass : impl::AddExceptionHandlingPassBase();
-    pm.addPass(mlir::bufferization::createOneShotBufferizePass(options));
     pm.addPass(mlir::createInlinerPass());
-    pm.addPass(catalyst::createGradientPreprocessingPass());
-    mlir::bufferization::OneShotBufferizationOptions gradient_buffer_options;
-    gradient_buffer_options.opFilter.allowDialect();
-    gradient_buffer_options.unknownTypeConverterFn =
-        [=](Value value, Attribute memorySpace,
-            const mlir::bufferization::BufferizationOptions &options) {
-            auto tensorType = cast(value.getType());
-            return bufferization::getMemRefTypeWithStaticIdentityLayout(tensorType, memorySpace);
-        };
-    pm.addPass(mlir::bufferization::createOneShotBufferizePass(gradient_buffer_options));
-    pm.addPass(mlir::createSCFBufferizePass());
     pm.addPass(mlir::createConvertTensorToLinalgPass());
     pm.addPass(mlir::createConvertElementwiseToLinalgPass());
-    pm.addPass(mlir::arith::createArithBufferizePass());
-    pm.addPass(mlir::bufferization::createEmptyTensorToAllocTensorPass());
-    pm.addNestedPass(mlir::bufferization::createBufferizationBufferizePass());
-    pm.addNestedPass(mlir::tensor::createTensorBufferizePass());
-    mlir::bufferization::OneShotBufferizationOptions catalyst_buffer_options;
-    catalyst_buffer_options.opFilter.allowDialect();
-    catalyst_buffer_options.unknownTypeConverterFn =
-        [=](Value value, Attribute memorySpace,
-            const mlir::bufferization::BufferizationOptions &options) {
-            auto tensorType = cast(value.getType());
-            return bufferization::getMemRefTypeWithStaticIdentityLayout(tensorType, memorySpace);
-        };
-    pm.addPass(mlir::bufferization::createOneShotBufferizePass(catalyst_buffer_options));
-    pm.addNestedPass(mlir::createLinalgBufferizePass());
-    pm.addNestedPass(mlir::tensor::createTensorBufferizePass());
-    mlir::bufferization::OneShotBufferizationOptions quantum_buffer_options;
-    quantum_buffer_options.opFilter.allowDialect();
-    pm.addPass(mlir::bufferization::createOneShotBufferizePass(quantum_buffer_options));
-    pm.addPass(mlir::func::createFuncBufferizePass());
-    pm.addNestedPass(mlir::bufferization::createFinalizingBufferizePass());
+    pm.addPass(catalyst::createGradientPreprocessingPass());
+    pm.addPass(mlir::bufferization::createEmptyTensorEliminationPass());
+    ///////////
+    mlir::bufferization::OneShotBufferizationOptions options;
+    options.bufferizeFunctionBoundaries = true;
+    options.allowReturnAllocsFromLoops = true;
+    options.setFunctionBoundaryTypeConversion(
+        mlir::bufferization::LayoutMapOption::IdentityLayoutMap);
+    options.unknownTypeConverterFn = [=](Value value, Attribute memorySpace,
+                                         const mlir::bufferization::BufferizationOptions &options) {
+        auto tensorType = cast(value.getType());
+        return bufferization::getMemRefTypeWithStaticIdentityLayout(tensorType, memorySpace);
+    };
+    pm.addPass(mlir::bufferization::createOneShotBufferizePass(options));
+    //////////////
     pm.addPass(mlir::createCanonicalizerPass());
     pm.addPass(catalyst::createGradientPostprocessingPass());
     pm.addNestedPass(mlir::bufferization::createBufferHoistingPass());
     pm.addNestedPass(mlir::bufferization::createBufferLoopHoistingPass());
+    pm.addNestedPass(mlir::bufferization::createPromoteBuffersToStackPass());
     pm.addNestedPass(mlir::bufferization::createBufferDeallocationPass());
     pm.addPass(catalyst::createArrayListToMemRefPass());
     pm.addPass(mlir::createBufferizationToMemRefPass());
diff --git a/mlir/lib/Driver/Timer.hpp b/mlir/lib/Driver/Timer.hpp
index 0974f3073b..7c61b0c53c 100644
--- a/mlir/lib/Driver/Timer.hpp
+++ b/mlir/lib/Driver/Timer.hpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <utility> // std::forward
 #include

diff --git a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp
index c9fb1bc8e7..464ab29089 100644
--- a/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp
+++ b/mlir/lib/Gradient/Transforms/GradMethods/ClassicalJacobian.cpp
@@ -148,7 +148,7 @@ func::FuncOp genSplitPreprocessed(PatternRewriter &rewriter, Location loc, func:
     PatternRewriter::InsertionGuard insertGuard(rewriter);
     rewriter.setInsertionPointToStart(&splitFn.getBody().front());
     Value paramsBuffer = rewriter.create(loc, paramsBufferType, paramCount);
-    Value paramsTensor = rewriter.create(loc, paramsBuffer);
+    Value paramsTensor = rewriter.create(loc, paramsBuffer, true);
     qnodeQuantumArgs.push_back(paramsTensor);

     MemRefType paramsProcessedType = MemRefType::get({}, rewriter.getIndexType());
diff --git a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp
index 9f10b14085..ff6d172908 100644
--- a/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp
+++ b/mlir/lib/Gradient/Transforms/GradMethods/PS_QuantumGradient.cpp
@@ -59,7 +59,7 @@ static std::vector computePartialDerivative(PatternRewriter &rewriter, Lo
 {
     constexpr double shift = llvm::numbers::pi / 2;
     ShapedType shiftVectorType = RankedTensorType::get({numShifts}, rewriter.getF64Type());
-    Value selectorVector = rewriter.create(loc, selectorBuffer);
+    Value selectorVector = rewriter.create(loc, selectorBuffer, true);

     // Define the shift vectors (pos/neg) as sparse tensor constants.
     DenseElementsAttr nonZeroIndices = rewriter.getI64TensorAttr(currentShift);
@@ -286,7 +286,7 @@ func::FuncOp ParameterShiftLowering::genQGradFunction(PatternRewriter &rewriter,
             gradientTensors.reserve(gradResTypes.size());
             for (Value gradientBuffer : gradientBuffers) {
                 gradientTensors.push_back(
-                    rewriter.create(loc, gradientBuffer));
+                    rewriter.create(loc, gradientBuffer, true));
             }
             op->setOperands(gradientTensors);
         }
diff --git a/mlir/lib/QEC/Transforms/CommuteCliffordPastPPM.cpp b/mlir/lib/QEC/Transforms/CommuteCliffordPastPPM.cpp
index 7fa027147e..5d06e7f2b3 100644
--- a/mlir/lib/QEC/Transforms/CommuteCliffordPastPPM.cpp
+++ b/mlir/lib/QEC/Transforms/CommuteCliffordPastPPM.cpp
@@ -15,10 +15,9 @@
 #define DEBUG_TYPE "merge_ppr_ppm"

 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
-// #include "mlir/Analysis/TopologicalSortUtils.h" // enable when updating llvm
-#include "mlir/Transforms/TopologicalSortUtils.h"

 #include "QEC/IR/QECDialect.h"
 #include "QEC/IR/QECOpInterfaces.h"
diff --git a/mlir/lib/QEC/Transforms/CommuteCliffordTPPR.cpp b/mlir/lib/QEC/Transforms/CommuteCliffordTPPR.cpp
index 8cbb77d238..11bae9df0a 100644
--- a/mlir/lib/QEC/Transforms/CommuteCliffordTPPR.cpp
+++ b/mlir/lib/QEC/Transforms/CommuteCliffordTPPR.cpp
@@ -14,9 +14,8 @@
 #define DEBUG_TYPE "commute_ppr"

+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "llvm/Support/Debug.h"
-// #include "mlir/Analysis/TopologicalSortUtils.h" // enable when updating llvm
-#include "mlir/Transforms/TopologicalSortUtils.h"

 #include "QEC/IR/QECDialect.h"
 #include "QEC/IR/QECOpInterfaces.h"
diff --git a/mlir/lib/Quantum/Transforms/IonsDecompositionPatterns.cpp b/mlir/lib/Quantum/Transforms/IonsDecompositionPatterns.cpp
index 047e60f735..96efc974d9 100644
--- a/mlir/lib/Quantum/Transforms/IonsDecompositionPatterns.cpp
+++ b/mlir/lib/Quantum/Transforms/IonsDecompositionPatterns.cpp
@@ -14,9 +14,12 @@
 #define DEBUG_TYPE "ions-decomposition"

+#include
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+
 #include "Quantum/IR/QuantumOps.h"
 #include "Quantum/Transforms/Patterns.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"

 using namespace mlir;
 using namespace catalyst::quantum;
diff --git a/mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp b/mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp
index 34d246ae7c..386023eb04 100644
--- a/mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp
+++ b/mlir/lib/Quantum/Transforms/emit_catalyst_pyface.cpp
@@ -214,7 +214,7 @@ struct EmitCatalystPyInterfacePass
         patterns.add(context);
         GreedyRewriteConfig config;
         config.strictMode = GreedyRewriteStrictness::ExistingOps;
-        config.enableRegionSimplification = false;
+        config.enableRegionSimplification = mlir::GreedySimplifyRegionLevel::Disabled;
         config.maxIterations = 1;

         auto op = getOperation();
diff --git a/mlir/llvm-project b/mlir/llvm-project
index 3a83162168..5f74671c85 160000
--- a/mlir/llvm-project
+++ b/mlir/llvm-project
@@ -1 +1 @@
-Subproject commit 3a8316216807d64a586b971f51695e23883331f7
+Subproject commit 5f74671c85877e03622e8d308aee15ed73ccee7c
diff --git a/mlir/mlir-hlo b/mlir/mlir-hlo
index 89a891c986..25b008569f 160000
--- a/mlir/mlir-hlo
+++ b/mlir/mlir-hlo
@@ -1 +1 @@
-Subproject commit 89a891c986650c33df76885f5620e0a92150d90f
+Subproject commit 25b008569f413d76cfa8f481f3a84e82b89c47f4
diff --git a/mlir/test/Catalyst/ConversionTest.mlir b/mlir/test/Catalyst/ConversionTest.mlir
index a7d179e7ec..29ab3dfbe9 100644
--- a/mlir/test/Catalyst/ConversionTest.mlir
+++ b/mlir/test/Catalyst/ConversionTest.mlir
@@ -147,9 +147,9 @@ module @test1 {
     // CHECK-SAME:)
     func.func private @foo(%arg0: tensor) -> tensor {
         // CHECK: [[memref0:%.+]] = bufferization.to_memref [[arg0]]
-        // CHECK: [[struct0:%.+]] = builtin.unrealized_conversion_cast [[memref0]]
         // CHECK: [[ptr0:%.+]] = llvm.alloca {{.*}}
         // CHECK: [[ptr1:%.+]] = llvm.alloca {{.*}}
+        // CHECK: [[struct0:%.+]] = builtin.unrealized_conversion_cast [[memref0]]
         // CHECK: [[tensor1:%.+]] = bufferization.alloc_tensor()
         // CHECK: [[memref1:%.+]] = bufferization.to_memref [[tensor1]]

diff --git a/mlir/test/Catalyst/MemrefLoadStoreLoweringTBAA.mlir b/mlir/test/Catalyst/MemrefLoadStoreLoweringTBAA.mlir
index 11dc2b0d93..39b552e4c0 100644
--- a/mlir/test/Catalyst/MemrefLoadStoreLoweringTBAA.mlir
+++ b/mlir/test/Catalyst/MemrefLoadStoreLoweringTBAA.mlir
@@ -35,8 +35,8 @@ module @my_model {
   llvm.func @my_func(...)
   llvm.func @__enzyme_autodiff0(...)
   func.func @func_i32(%arg0: memref, %arg1: memref<4xi32>) -> (memref, memref<4xi32>) {
-    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[castArg1:%.+]] = builtin.unrealized_conversion_cast %arg1 : memref<4xi32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[extract0:%.+]] = llvm.extractvalue [[castArg0]][1] : !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[load:%.+]] = llvm.load [[extract0]] {tbaa = [[[tag]]]} : !llvm.ptr -> i32
     // CHECK: [[idx:%.+]] = index.constant 0
@@ -59,8 +59,8 @@
 module @my_model {
   llvm.func @__enzyme_autodiff1(...)
   func.func @func_f32(%arg0: memref, %arg1: memref<4xf32>) -> (memref, memref<4xf32>) {
-    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[castArg1:%.+]] = builtin.unrealized_conversion_cast %arg1 : memref<4xf32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[extract0:%.+]] = llvm.extractvalue [[castArg0]][1] : !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[load:%.+]] = llvm.load [[extract0]] {tbaa = [[[tag]]]} : !llvm.ptr -> f32
     // CHECK: [[idx:%.+]] = index.constant 0
@@ -83,8 +83,8 @@
 module @my_model {
   llvm.func @__enzyme_autodiff1(...)
   func.func @func_f64(%arg0: memref, %arg1: memref<4xf64>) -> (memref, memref<4xf64>) {
-    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[castArg1:%.+]] = builtin.unrealized_conversion_cast %arg1 : memref<4xf64> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[extract0:%.+]] = llvm.extractvalue [[castArg0]][1] : !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[load:%.+]] = llvm.load [[extract0]] {tbaa = [[[tag]]]} : !llvm.ptr -> f64
     // CHECK: [[idx:%.+]] = index.constant 0
@@ -109,10 +109,10 @@
 module @my_model {
   llvm.func @__enzyme_autodiff2(...)
   func.func @func_mix_f64_index(%arg0: memref, %arg1: memref<4xf64>, %arg2: memref, %arg3: memref<3xindex>) -> (memref<4xf64>, memref<3xindex>) {
-    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
-    // CHECK: [[castArg1:%.+]] = builtin.unrealized_conversion_cast %arg1 : memref<4xf64> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-    // CHECK: [[castArg2:%.+]] = builtin.unrealized_conversion_cast %arg2 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[castArg3:%.+]] = builtin.unrealized_conversion_cast %arg3 : memref<3xindex> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[castArg2:%.+]] = builtin.unrealized_conversion_cast %arg2 : memref to !llvm.struct<(ptr, ptr, i64)>
+    // CHECK: [[castArg1:%.+]] = builtin.unrealized_conversion_cast %arg1 : memref<4xf64> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[castArg0:%.+]] = builtin.unrealized_conversion_cast %arg0 : memref to !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[extract0:%.+]] = llvm.extractvalue [[castArg0]][1] : !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[load0:%.+]] = llvm.load [[extract0]] {tbaa = [[[tagdouble]]]} : !llvm.ptr -> f64
     // CHECK: [[idx:%.+]] = index.constant 0
diff --git a/mlir/test/Gradient/ConversionTest.mlir b/mlir/test/Gradient/ConversionTest.mlir
index 88cf72108b..04fca8c5d3 100644
--- a/mlir/test/Gradient/ConversionTest.mlir
+++ b/mlir/test/Gradient/ConversionTest.mlir
@@ -62,11 +62,8 @@ module @test0 {
     // CHECK: [[in0struct:%.+]] = llvm.load [[in0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
     // CHECK: [[in0memref:%.+]] = builtin.unrealized_conversion_cast [[in0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
     // CHECK: [[diff0struct:%.+]] = llvm.load [[diff0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
-    // CHECK: [[diff0memref:%.+]] = builtin.unrealized_conversion_cast [[diff0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
     // CHECK: [[out0struct:%.+]] = llvm.load [[out0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
-    // CHECK: [[out0memref:%.+]] = builtin.unrealized_conversion_cast [[out0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
     // CHECK: [[cotan0struct:%.+]] = llvm.load [[cotan0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
-    // CHECK: [[cotan0memref:%.+]] = builtin.unrealized_conversion_cast [[cotan0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref

     // CHECK: [[results:%.+]]:2 = call @fwd([[in0memref]])
     %1:2 = func.call @fwd(%arg0) : (memref) -> (memref, memref)
@@ -117,9 +114,7 @@ module @test1 {
     gradient.reverse @rev.rev(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) attributes {argc = 1 : i64, implementation = @rev, resc = 1 : i64, tape = 1 : i64} {
         // CHECK: [[in0struct:%.+]] = llvm.load [[in0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
-        // CHECK: [[in0memref:%.+]] = builtin.unrealized_conversion_cast [[in0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
         // CHECK: [[diff0struct:%.+]] = llvm.load [[diff0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
-        // CHECK: [[diff0memref:%.+]] = builtin.unrealized_conversion_cast [[diff0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
         // CHECK: [[out0struct:%.+]] = llvm.load [[out0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>
         // CHECK: [[out0memref:%.+]] = builtin.unrealized_conversion_cast [[out0struct]] : !llvm.struct<(ptr, ptr, i64)> to memref
         // CHECK: [[cotan0struct:%.+]] = llvm.load [[cotan0ptr]] : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64)>

diff --git a/mlir/test/Gradient/PS_QuantumGradientTest.mlir b/mlir/test/Gradient/PS_QuantumGradientTest.mlir
index c13ca339d4..274168b293 100644
--- a/mlir/test/Gradient/PS_QuantumGradientTest.mlir
+++ b/mlir/test/Gradient/PS_QuantumGradientTest.mlir
@@ -48,7 +48,7 @@ func.func @simple_circuit(%arg0: tensor<3xf64>) -> f64 attributes {qnode, diff_m
     // CHECK-NOT: quantum.custom
     %q_1 = quantum.custom "h"() %q_0 : !quantum.bit

-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift0pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift0neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -61,7 +61,7 @@ func.func @simple_circuit(%arg0: tensor<3xf64>) -> f64 attributes {qnode, diff_m
     // CHECK-NOT: quantum.custom
     %q_2 = quantum.custom "rz"(%f0) %q_1 : !quantum.bit

-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift1pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift1neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -71,7 +71,7 @@ func.func @simple_circuit(%arg0: tensor<3xf64>) -> f64 attributes {qnode, diff_m
     // CHECK: [[newIdx:%[a-zA-Z0-9_]+]] = index.add [[idx]], [[c1]]
     // CHECK: memref.store [[newIdx]], [[gradIdx]]
     //
-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift2pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift2neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -81,7 +81,7 @@ func.func @simple_circuit(%arg0: tensor<3xf64>) -> f64 attributes {qnode, diff_m
     // CHECK: [[newIdx:%[a-zA-Z0-9_]+]] = index.add [[idx]], [[c1]]
     // CHECK: memref.store [[newIdx]], [[gradIdx]]
     //
-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift3pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @simple_circuit.shifted(%arg0, [[shift3neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -96,7 +96,7 @@ func.func @simple_circuit(%arg0: tensor<3xf64>) -> f64 attributes {qnode, diff_m
     %obs = quantum.namedobs %q_3[PauliX] : !quantum.obs
     %expval = quantum.expval %obs : f64

-    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]]
+    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]] restrict
     // CHECK: return [[ret]] : tensor
     func.return %expval : f64
 }
@@ -136,7 +136,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute
     // CHECK-NOT: quantum.extract
     %q_0 = quantum.extract %r[%idx] : !quantum.reg -> !quantum.bit

-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @structured_circuit.shifted(%arg0, %arg1, %arg2, [[shift0pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @structured_circuit.shifted(%arg0, %arg1, %arg2, [[shift0neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -151,7 +151,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute

     // CHECK: scf.if %arg1
     %q_2 = scf.if %arg1 -> !quantum.bit {
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %arg2, [[shift1pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %arg2, [[shift1neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -166,7 +166,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute

     // CHECK: scf.if %arg2
     %q_1_1 = scf.if %arg2 -> !quantum.bit {
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %true, [[shift2pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %true, [[shift2neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -181,7 +181,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute
         scf.yield %q_1_0_0 : !quantum.bit
     // CHECK: else
     } else {
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %false, [[shift3pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %false, [[shift3neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -193,7 +193,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute
         //
         // CHECK-NOT: quantum.custom
         %q_1_0_1 = quantum.custom "rz"(%arg0) %q_1_0 : !quantum.bit
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %false, [[shift4pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @structured_circuit.shifted(%arg0, %true, %false, [[shift4neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -216,7 +216,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute
     cf.br ^exit

 ^exit:
-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @structured_circuit.shifted(%arg0, %arg1, %arg2, [[shift5pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @structured_circuit.shifted(%arg0, %arg1, %arg2, [[shift5neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -231,7 +231,7 @@ func.func @structured_circuit(%arg0: f64, %arg1: i1, %arg2: i1) -> f64 attribute
     %obs = quantum.namedobs %q_3[PauliX] : !quantum.obs
     %expval = quantum.expval %obs : f64

-    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]]
+    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]] restrict
     // CHECK: return [[ret]] : tensor
     func.return %expval : f64
 }
@@ -267,7 +267,7 @@ func.func @loop_circuit(%arg0: f64) -> f64 attributes {qnode, diff_method = "par
     // CHECK-NOT: quantum.extract
     %q_0 = quantum.extract %r[%idx] : !quantum.reg -> !quantum.bit

-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @loop_circuit.shifted(%arg0, [[shift0pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @loop_circuit.shifted(%arg0, [[shift0neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -288,7 +288,7 @@ func.func @loop_circuit(%arg0: f64) -> f64 attributes {qnode, diff_method = "par
     %q_2 = scf.for %i = %lb to %ub step %st iter_args(%q_1_0 = %q_1) -> !quantum.bit {

         // CHECK: memref.store [[i]], [[selBuff]][[[c0]]]
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift1pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift1neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -308,7 +308,7 @@ func.func @loop_circuit(%arg0: f64) -> f64 attributes {qnode, diff_method = "par
     %q_3 = scf.for %j = %lb to %ub step %st iter_args(%q_2_0 = %q_2) -> !quantum.bit {

         // CHECK: memref.store [[j]], [[selBuff]][[[c0]]]
-        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+        // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
         // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift2pos]], [[sel]])
         // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift2neg]], [[sel]])
         // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -325,7 +325,7 @@ func.func @loop_circuit(%arg0: f64) -> f64 attributes {qnode, diff_method = "par
         %q_1_1 = scf.for %k = %j to %ub step %st iter_args(%q_2_1_0 = %q_2_1) -> !quantum.bit {

             // CHECK: memref.store [[k]], [[selBuff]][[[c1]]]
-            // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+            // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
             // CHECK: [[epos:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift3pos]], [[sel]])
             // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = func.call @loop_circuit.shifted(%arg0, [[shift3neg]], [[sel]])
             // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -346,7 +346,7 @@ func.func @loop_circuit(%arg0: f64) -> f64 attributes {qnode, diff_method = "par
     %obs = quantum.namedobs %q_3[PauliX] : !quantum.obs
     %expval = quantum.expval %obs : f64

-    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]]
+    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]] restrict
     // CHECK: return [[ret]] : tensor
     func.return %expval : f64
 }
@@ -376,7 +376,7 @@ func.func @tensor_circuit(%arg0: f64) -> tensor<2x3xf64> attributes {qnode, diff
     // CHECK-NOT: quantum.extract
     %q_0 = quantum.extract %r[%idx] : !quantum.reg -> !quantum.bit

-    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]]
+    // CHECK: [[sel:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[selBuff]] restrict
     // CHECK: [[epos:%[a-zA-Z0-9_]+]] = call @tensor_circuit.shifted(%arg0, [[shift0pos]], [[sel]])
     // CHECK: [[eneg:%[a-zA-Z0-9_]+]] = call @tensor_circuit.shifted(%arg0, [[shift0neg]], [[sel]])
     // CHECK: [[diff:%[a-zA-Z0-9_]+]] = arith.subf [[epos]], [[eneg]]
@@ -392,7 +392,7 @@ func.func @tensor_circuit(%arg0: f64) -> tensor<2x3xf64> attributes {qnode, diff
     %obs = quantum.namedobs %q_1[PauliX] : !quantum.obs
     %expval = quantum.expval %obs : f64

-    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]]
+    // CHECK: [[ret:%[a-zA-Z0-9_]+]] = bufferization.to_tensor [[grad]] restrict
     // CHECK: return [[ret]] : tensor
     %res = tensor.from_elements %expval, %expval, %expval, %expval, %expval, %expval : tensor<2x3xf64>
     func.return %res : tensor<2x3xf64>
@@ -425,7 +425,7 @@ func.func @multi_res_circuit(%arg0: f64) -> (f64, tensor<2xf64>) attributes {qno
     %r = quantum.alloc(1) : !quantum.reg
     %q_0 = quantum.extract %r[%idx] : !quantum.reg -> !quantum.bit

-    // CHECK: [[SEL:%.+]] = bufferization.to_tensor [[SELBUFF]] : memref<0xindex>
+    // CHECK: [[SEL:%.+]] = bufferization.to_tensor [[SELBUFF]] restrict : memref<0xindex>
     // CHECK: [[EVALPOS:%.+]]:2 = call @multi_res_circuit.shifted(%arg0, [[SHIFTPOS]], [[SEL]]) : {{.+}} -> (f64, tensor<2xf64>)
     // CHECK: [[EVALNEG:%.+]]:2 = call @multi_res_circuit.shifted(%arg0, [[SHIFTNEG]], [[SEL]]) : {{.+}} -> (f64, tensor<2xf64>)
     // CHECK: [[DIFF0:%.+]] = arith.subf [[EVALPOS]]#0, [[EVALNEG]]#0
@@ -443,8 +443,8 @@ func.func @multi_res_circuit(%arg0: f64) -> (f64, tensor<2xf64>) attributes {qno
     %obs = quantum.namedobs %q_1[PauliX] : !quantum.obs
     %expval = quantum.expval %obs : f64

-    // CHECK: [[RES0:%.+]] = bufferization.to_tensor [[GRAD0]]
-    // CHECK: [[RES1:%.+]] = bufferization.to_tensor [[GRAD1]]
+    // CHECK: [[RES0:%.+]] = bufferization.to_tensor [[GRAD0]] restrict
+    // CHECK: [[RES1:%.+]] = bufferization.to_tensor [[GRAD1]] restrict
     // CHECK: return [[RES0]], [[RES1]] : tensor, tensor
     %res = tensor.from_elements %expval, %expval : tensor<2xf64>
     func.return %arg0, %res : f64, tensor<2xf64>
diff --git a/mlir/test/Quantum/ConversionTest.mlir b/mlir/test/Quantum/ConversionTest.mlir
index 3c69c6abac..fb5b46a372 100644
--- a/mlir/test/Quantum/ConversionTest.mlir
+++ b/mlir/test/Quantum/ConversionTest.mlir
@@ -221,9 +221,9 @@ module @custom_gate {
     // CHECK: llvm.func @__catalyst__qis__RX(f64, !llvm.ptr, !llvm.ptr)
     // CHECK-LABEL: @test
     func.func @test(%q0: !quantum.bit, %p: f64) -> () {
-        // CHECK: [[nullptr:%.+]] = llvm.mlir.zero
         // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
         // CHECK: [[alloca:%.+]] = llvm.alloca [[c1]] x !llvm.struct<(i1, i64, ptr, ptr)>
+        // CHECK: [[nullptr:%.+]] = llvm.mlir.zero
         // CHECK: [[true:%.+]] = llvm.mlir.constant(true)
         // CHECK: [[off0:%.+]] = llvm.getelementptr inbounds [[alloca]][0, 0]
         // CHECK: [[off1:%.+]] = llvm.getelementptr inbounds [[alloca]][0, 1]
@@ -382,6 +382,8 @@ func.func @hamiltonian(%obs : !quantum.obs, %p1 : memref<1xf64>, %p2 : memref<3x
     // CHECK: [[memrefvar2:%.+]] = llvm.insertvalue %arg3, [[memrefvar1]][2]
     // CHECK: [[memrefvar3:%.+]] = llvm.insertvalue %arg4, [[memrefvar2]][3, 0]
     // CHECK: [[memrefvar4:%.+]] = llvm.insertvalue %arg5, [[memrefvar3]][4, 0]
+    // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast [[memrefvar4]]
+    // CHECK: [[memrefvar4:%.+]] = builtin.unrealized_conversion_cast [[cast]]
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: llvm.store [[memrefvar4]], [[alloca]]
     // CHECK: llvm.call @__catalyst__qis__HamiltonianObs([[alloca]], [[c1]], %arg0)
@@ -401,8 +403,10 @@ func.func @hamiltonian(%obs : !quantum.obs, %p1 : memref<1xf64>, %p2 : memref<3x
     // CHECK: [[memrefvar2:%.+]] = llvm.insertvalue %arg8, [[memrefvar1]][2]
     // CHECK: [[memrefvar3:%.+]] = llvm.insertvalue %arg9, [[memrefvar2]][3, 0]
     // CHECK: [[memrefvar4:%.+]] = llvm.insertvalue %arg10, [[memrefvar3]][4, 0]
+    // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast [[memrefvar4]]
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[alloca:%.+]] = llvm.alloca [[c1]] x !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[memrefvar4:%.+]] = builtin.unrealized_conversion_cast [[cast]]
     // CHECK: [[c3:%.+]] = llvm.mlir.constant(3 : i64)
     // CHECK: llvm.store [[memrefvar4]], [[alloca]]
     // CHECK: llvm.call @__catalyst__qis__HamiltonianObs([[alloca]], [[c3]], %arg0, %arg0, %arg0)
@@ -580,12 +584,11 @@ func.func @probs(%q : !quantum.bit) {
 // CHECK-LABEL: @state
 func.func @state(%q : !quantum.bit) {
-    // CHECK: [[qb:%.+]] = builtin.unrealized_conversion_cast %arg0
-
     %o1 = quantum.compbasis qubits %q : !quantum.obs

     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[ptr:%.+]] = llvm.alloca [[c1]] x !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: [[qb:%.+]] = builtin.unrealized_conversion_cast %arg0
     // CHECK: [[c0:%.+]] = llvm.mlir.constant(0 : i64)
     // CHECK: llvm.call @__catalyst__qis__State([[ptr]], [[c0]])
     %alloc1 = memref.alloc() : memref<2xcomplex>
@@ -618,13 +621,13 @@ func.func @controlled_circuit(%1 : !quantum.bit, %2 : !quantum.bit, %3 : !quantu
     %cst_0 = llvm.mlir.constant (9.000000e-01 : f64) : f64
     %cst_1 = llvm.mlir.constant (3.000000e-01 : f64) : f64

-    // CHECK: [[true:%.+]] = llvm.mlir.constant(true)
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[alloca0:%.+]] = llvm.alloca [[c1]] x i1
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[alloca1:%.+]] = llvm.alloca [[c1]] x !llvm.ptr
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[mod:%.+]] = llvm.alloca [[c1]] x !llvm.struct<(i1, i64, ptr, ptr)>
+    // CHECK: [[true:%.+]] = llvm.mlir.constant(true)

     // CHECK-DAG: [[cst6:%.+]] = llvm.mlir.constant(6.0
@@ -658,13 +661,13 @@ func.func @controlled_circuit(%1 : !quantum.bit, %2 : !quantum.bit, %3 : !quantu
     %cst = llvm.mlir.constant (6.000000e-01 : f64) : f64
     %true = llvm.mlir.constant (1 : i1) :i1

-    // CHECK-DAG: [[cst6:%.+]] = llvm.mlir.constant(6.0
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[alloca0:%.+]] = llvm.alloca [[c1]] x i1
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[alloca1:%.+]] = llvm.alloca [[c1]] x !llvm.ptr
     // CHECK: [[c1:%.+]] = llvm.mlir.constant(1 : i64)
     // CHECK: [[mod:%.+]] = llvm.alloca [[c1]] x !llvm.struct<(i1, i64, ptr, ptr)>
+    // CHECK: [[cst6:%.+]] = llvm.mlir.constant(6.0
     // CHECK: [[true:%.+]] = llvm.mlir.constant(true)

     // CHECK: [[offset0:%.+]] = llvm.getelementptr inbounds [[mod]][0, 0]