diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 58ee87cf82039..afeed370ce347 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -61,7 +61,10 @@ def ConvertToSPIRVPass : Pass<"convert-to-spirv"> {
     "Run vector unrolling to convert vector types in function bodies">,
     Option<"convertGPUModules", "convert-gpu-modules", "bool",
     /*default=*/"false",
-    "Clone and convert GPU modules">
+    "Clone and convert GPU modules">,
+    Option<"nestInGPUModule", "nest-in-gpu-module", "bool",
+    /*default=*/"false",
+    "Put converted SPIR-V module inside the gpu.module instead of alongside it.">,
   ];
 }
 
diff --git a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
index 4b7f7ff114dee..ab9c048f56106 100644
--- a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
@@ -108,7 +108,10 @@ struct ConvertToSPIRVPass final
     SmallVector<Operation *, 1> gpuModules;
     OpBuilder builder(context);
     op->walk([&](gpu::GPUModuleOp gpuModule) {
-      builder.setInsertionPoint(gpuModule);
+      if (nestInGPUModule)
+        builder.setInsertionPointToStart(gpuModule.getBody());
+      else
+        builder.setInsertionPoint(gpuModule);
       gpuModules.push_back(builder.clone(*gpuModule));
     });
     // Run conversion for each module independently as they can have
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 08b451f7d5b32..509b6343057b9 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -71,7 +71,7 @@ void GPUToSPIRVPass::runOnOperation() {
     // launch op still needs the original GPU kernel module.
     // For Vulkan Shader capabilities, we insert the newly converted SPIR-V
     // module right after the original GPU module, as that's the expectation of
-    // the in-tree Vulkan runner.
+    // the in-tree SPIR-V CPU runner (the Vulkan runner does not use this pass).
     // For OpenCL Kernel capabilities, we insert the newly converted SPIR-V
     // module inside the original GPU module, as that's the expectaion of the
     // normal GPU compilation pipeline.
diff --git a/mlir/lib/Conversion/GPUToVulkan/ConvertGPULaunchFuncToVulkanLaunchFunc.cpp b/mlir/lib/Conversion/GPUToVulkan/ConvertGPULaunchFuncToVulkanLaunchFunc.cpp
index 2d2251672230b..8488fac69e8e3 100644
--- a/mlir/lib/Conversion/GPUToVulkan/ConvertGPULaunchFuncToVulkanLaunchFunc.cpp
+++ b/mlir/lib/Conversion/GPUToVulkan/ConvertGPULaunchFuncToVulkanLaunchFunc.cpp
@@ -7,9 +7,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a pass to convert gpu launch function into a vulkan
-// launch function. Creates a SPIR-V binary shader from the `spirv::ModuleOp`
-// using `spirv::serialize` function, attaches binary data and entry point name
-// as an attributes to vulkan launch call op.
+// launch function. Extracts the SPIR-V from a `gpu::BinaryOp` and attaches it
+// along with the entry point name as attributes to a Vulkan launch call op.
 //
 //===----------------------------------------------------------------------===//
 
@@ -40,10 +39,9 @@ static constexpr const char *kVulkanLaunch = "vulkanLaunch";
 
 namespace {
 
-/// A pass to convert gpu launch op to vulkan launch call op, by creating a
-/// SPIR-V binary shader from `spirv::ModuleOp` using `spirv::serialize`
-/// function and attaching binary data and entry point name as an attributes to
-/// created vulkan launch call op.
+/// A pass to convert gpu launch op to vulkan launch call op, by extracting a
+/// SPIR-V binary shader from a `gpu::BinaryOp` and attaching binary data and
+/// entry point name as an attributes to created vulkan launch call op.
 class ConvertGpuLaunchFuncToVulkanLaunchFunc
     : public impl::ConvertGpuLaunchFuncToVulkanLaunchFuncBase<
           ConvertGpuLaunchFuncToVulkanLaunchFunc> {
@@ -51,10 +49,9 @@ class ConvertGpuLaunchFuncToVulkanLaunchFunc
   void runOnOperation() override;
 
 private:
-  /// Creates a SPIR-V binary shader from the given `module` using
-  /// `spirv::serialize` function.
-  LogicalResult createBinaryShader(ModuleOp module,
-                                   std::vector<char> &binaryShader);
+  /// Extracts a SPIR-V binary shader from the given `module`, if any.
+  /// Note that this also removes the binary from the IR.
+  FailureOr<StringAttr> getBinaryShader(ModuleOp module);
 
   /// Converts the given `launchOp` to vulkan launch call.
   void convertGpuLaunchFunc(gpu::LaunchFuncOp launchOp);
@@ -135,22 +132,35 @@ LogicalResult ConvertGpuLaunchFuncToVulkanLaunchFunc::declareVulkanLaunchFunc(
   return success();
 }
 
-LogicalResult ConvertGpuLaunchFuncToVulkanLaunchFunc::createBinaryShader(
-    ModuleOp module, std::vector<char> &binaryShader) {
+FailureOr<StringAttr>
+ConvertGpuLaunchFuncToVulkanLaunchFunc::getBinaryShader(ModuleOp module) {
   bool done = false;
-  SmallVector<uint32_t, 0> binary;
-  for (auto spirvModule : module.getOps<spirv::ModuleOp>()) {
+  StringAttr binaryAttr;
+  gpu::BinaryOp binaryToErase;
+  for (auto gpuBinary : module.getOps<gpu::BinaryOp>()) {
     if (done)
-      return spirvModule.emitError("should only contain one 'spirv.module' op");
+      return gpuBinary.emitError("should only contain one 'gpu.binary' op");
     done = true;
 
-    if (failed(spirv::serialize(spirvModule, binary)))
-      return failure();
+    ArrayRef<Attribute> objects = gpuBinary.getObjectsAttr().getValue();
+    if (objects.size() != 1)
+      return gpuBinary.emitError("should only contain a single object");
+
+    auto object = cast<gpu::ObjectAttr>(objects[0]);
+
+    if (!isa<spirv::TargetEnvAttr>(object.getTarget()))
+      return gpuBinary.emitError(
+          "should contain an object with a SPIR-V target environment");
+
+    binaryAttr = object.getObject();
+    binaryToErase = gpuBinary;
   }
-  binaryShader.resize(binary.size() * sizeof(uint32_t));
-  std::memcpy(binaryShader.data(), reinterpret_cast<char *>(binary.data()),
-              binaryShader.size());
-  return success();
+  if (!done)
+    return module.emitError("should contain a 'gpu.binary' op");
+
+  // Remove the binary to avoid confusing later conversion passes.
+  binaryToErase.erase();
+  return binaryAttr;
 }
 
 void ConvertGpuLaunchFuncToVulkanLaunchFunc::convertGpuLaunchFunc(
@@ -159,9 +169,9 @@ void ConvertGpuLaunchFuncToVulkanLaunchFunc::convertGpuLaunchFunc(
   OpBuilder builder(launchOp);
   Location loc = launchOp.getLoc();
 
-  // Serialize `spirv::Module` into binary form.
-  std::vector<char> binary;
-  if (failed(createBinaryShader(module, binary)))
+  FailureOr<StringAttr> binaryAttr = getBinaryShader(module);
+  // Extract SPIR-V from `gpu.binary` op.
+  if (failed(binaryAttr))
     return signalPassFailure();
 
   // Declare vulkan launch function.
@@ -182,9 +192,7 @@ void ConvertGpuLaunchFuncToVulkanLaunchFunc::convertGpuLaunchFunc(
       vulkanLaunchOperands);
 
   // Set SPIR-V binary shader data as an attribute.
-  vulkanLaunchCallOp->setAttr(
-      kSPIRVBlobAttrName,
-      builder.getStringAttr(StringRef(binary.data(), binary.size())));
+  vulkanLaunchCallOp->setAttr(kSPIRVBlobAttrName, *binaryAttr);
 
   // Set entry point name as an attribute.
   vulkanLaunchCallOp->setAttr(kSPIRVEntryPointAttrName,
diff --git a/mlir/test/Conversion/ConvertToSPIRV/convert-gpu-modules-nested.mlir b/mlir/test/Conversion/ConvertToSPIRV/convert-gpu-modules-nested.mlir
new file mode 100644
index 0000000000000..7562de17c606e
--- /dev/null
+++ b/mlir/test/Conversion/ConvertToSPIRV/convert-gpu-modules-nested.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt -convert-to-spirv="convert-gpu-modules=true nest-in-gpu-module=true run-signature-conversion=false run-vector-unrolling=false" %s | FileCheck %s
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Shader], []>, #spirv.resource_limits<>>
+} {
+  // CHECK-LABEL: func.func @main
+  // CHECK:       %[[C1:.*]] = arith.constant 1 : index
+  // CHECK:       gpu.launch_func  @[[$KERNELS_1:.*]]::@[[$BUILTIN_WG_ID_X:.*]] blocks in (%[[C1]], %[[C1]], %[[C1]]) threads in (%[[C1]], %[[C1]], %[[C1]])
+  func.func @main() {
+    %c1 = arith.constant 1 : index
+    gpu.launch_func @kernels_1::@builtin_workgroup_id_x
+        blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
+    return
+  }
+
+  // CHECK: gpu.module @[[$KERNELS_1]]
+  // CHECK:   spirv.module @{{.*}} Logical GLSL450
+  // CHECK:   spirv.func @[[$BUILTIN_WG_ID_X]]
+  // CHECK:   spirv.mlir.addressof
+  // CHECK:   spirv.Load "Input"
+  // CHECK:   spirv.CompositeExtract
+  gpu.module @kernels_1 {
+    gpu.func @builtin_workgroup_id_x() kernel
+      attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+      %0 = gpu.block_id x
+      gpu.return
+    }
+  }
+}
diff --git a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
index 665d0a33abedc..96ee1866517e6 100644
--- a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
+++ b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
@@ -1,24 +1,24 @@
-// RUN: mlir-opt %s -convert-gpu-launch-to-vulkan-launch | FileCheck %s
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Shader exts=SPV_KHR_storage_buffer_storage_class},gpu-module-to-binary,convert-gpu-launch-to-vulkan-launch)' | FileCheck %s
 
 // CHECK: %[[resource:.*]] = memref.alloc() : memref<12xf32>
 // CHECK: %[[index:.*]] = arith.constant 1 : index
 // CHECK: call @vulkanLaunch(%[[index]], %[[index]], %[[index]], %[[resource]]) {spirv_blob = "{{.*}}", spirv_element_types = [f32], spirv_entry_point = "kernel"}
 
 module attributes {gpu.container_module} {
-  spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
-    spirv.GlobalVariable @kernel_arg_0 bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
-    spirv.func @kernel() "None" attributes {workgroup_attributions = 0 : i64} {
-      %0 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
-      %2 = spirv.Constant 0 : i32
-      %3 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
-      %4 = spirv.AccessChain %0[%2, %2] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
-      %5 = spirv.Load "StorageBuffer" %4 : f32
-      spirv.Return
-    }
-    spirv.EntryPoint "GLCompute" @kernel
-    spirv.ExecutionMode @kernel "LocalSize", 1, 1, 1
-  }
   gpu.module @kernels {
+    spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
+      spirv.GlobalVariable @kernel_arg_0 bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
+      spirv.func @kernel() "None" attributes {workgroup_attributions = 0 : i64} {
+        %0 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
+        %2 = spirv.Constant 0 : i32
+        %3 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
+        %4 = spirv.AccessChain %0[%2, %2] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
+        %5 = spirv.Load "StorageBuffer" %4 : f32
+        spirv.Return
+      }
+      spirv.EntryPoint "GLCompute" @kernel
+      spirv.ExecutionMode @kernel "LocalSize", 1, 1, 1
+    }
     gpu.func @kernel(%arg0: memref<12xf32>) kernel {
       gpu.return
     }
diff --git a/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp b/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp
index eda9aa9f9efef..789c4d76cee0d 100644
--- a/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp
+++ b/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp
@@ -12,33 +12,57 @@
 
 #include "mlir/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.h"
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
 #include "mlir/Dialect/SPIRV/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
 
 using namespace mlir;
 
 namespace {
 
-void buildTestVulkanRunnerPipeline(OpPassManager &passManager) {
+struct VulkanRunnerPipelineOptions
+    : PassPipelineOptions<VulkanRunnerPipelineOptions> {
+  Option<bool> spirvWebGPUPrepare{
+      *this, "spirv-webgpu-prepare",
+      llvm::cl::desc("Run MLIR transforms used when targetting WebGPU")};
+};
+
+void buildTestVulkanRunnerPipeline(OpPassManager &passManager,
+                                   const VulkanRunnerPipelineOptions &options) {
   passManager.addPass(createGpuKernelOutliningPass());
   passManager.addPass(memref::createFoldMemRefAliasOpsPass());
 
+  GpuSPIRVAttachTargetOptions attachTargetOptions{};
+  attachTargetOptions.spirvVersion = "v1.0";
+  attachTargetOptions.spirvCapabilities.push_back("Shader");
+  attachTargetOptions.spirvExtensions.push_back(
+      "SPV_KHR_storage_buffer_storage_class");
+  passManager.addPass(createGpuSPIRVAttachTarget(attachTargetOptions));
+
   ConvertToSPIRVPassOptions convertToSPIRVOptions{};
   convertToSPIRVOptions.convertGPUModules = true;
+  convertToSPIRVOptions.nestInGPUModule = true;
   passManager.addPass(createConvertToSPIRVPass(convertToSPIRVOptions));
-  OpPassManager &modulePM = passManager.nest<spirv::ModuleOp>();
-  modulePM.addPass(spirv::createSPIRVLowerABIAttributesPass());
-  modulePM.addPass(spirv::createSPIRVUpdateVCEPass());
+
+  OpPassManager &spirvModulePM =
+      passManager.nest<gpu::GPUModuleOp>().nest<spirv::ModuleOp>();
+  spirvModulePM.addPass(spirv::createSPIRVLowerABIAttributesPass());
+  spirvModulePM.addPass(spirv::createSPIRVUpdateVCEPass());
+  if (options.spirvWebGPUPrepare)
+    spirvModulePM.addPass(spirv::createSPIRVWebGPUPreparePass());
+
+  passManager.addPass(createGpuModuleToBinaryPass());
 }
 
 } // namespace
 
 namespace mlir::test {
 void registerTestVulkanRunnerPipeline() {
-  PassPipelineRegistration<>(
+  PassPipelineRegistration<VulkanRunnerPipelineOptions>(
       "test-vulkan-runner-pipeline",
       "Runs a series of passes for lowering GPU-dialect MLIR to "
       "SPIR-V-dialect MLIR intended for mlir-vulkan-runner.",
diff --git a/mlir/test/mlir-vulkan-runner/addui_extended.mlir b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
index 158541f326be7..b8db451421459 100644
--- a/mlir/test/mlir-vulkan-runner/addui_extended.mlir
+++ b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
@@ -6,7 +6,7 @@
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s
 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline -spirv-webgpu-prepare \
+// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \
 // RUN:   | mlir-vulkan-runner - \
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s
diff --git a/mlir/test/mlir-vulkan-runner/smul_extended.mlir b/mlir/test/mlir-vulkan-runner/smul_extended.mlir
index 2dd31d2ebb9a0..334aec843e197 100644
--- a/mlir/test/mlir-vulkan-runner/smul_extended.mlir
+++ b/mlir/test/mlir-vulkan-runner/smul_extended.mlir
@@ -6,7 +6,7 @@
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s
 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline -spirv-webgpu-prepare \
+// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \
 // RUN:   | mlir-vulkan-runner - \
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s
diff --git a/mlir/test/mlir-vulkan-runner/umul_extended.mlir b/mlir/test/mlir-vulkan-runner/umul_extended.mlir
index 78300d2fd81dd..803b8c3d336d3 100644
--- a/mlir/test/mlir-vulkan-runner/umul_extended.mlir
+++ b/mlir/test/mlir-vulkan-runner/umul_extended.mlir
@@ -6,7 +6,7 @@
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s
 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline -spirv-webgpu-prepare \
+// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \
 // RUN:   | mlir-vulkan-runner - \
 // RUN:     --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
 // RUN:     --entry-point-result=void | FileCheck %s