diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index e3a53c8bcb5..a8de771a69d 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+e6f766c7d750d40603eee3f66c5915bac606b3ea
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index f896d3f1d40..8f48e75e712 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -44,6 +44,44 @@ install_pip_dependencies() {
   popd || return
 }
 
+# macOS only: collapse duplicate @loader_path LC_RPATH entries in the installed
+# torch dylibs into a single entry. Best-effort; does nothing on other platforms.
+dedupe_macos_loader_path_rpaths() {
+  if [[ "$(uname)" != "Darwin" ]]; then
+    return
+  fi
+
+  local torch_lib_dir lib_name lib_path removed
+  pushd .. || return
+  torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
+  popd || return
+
+  if [[ ! -d "${torch_lib_dir}" ]]; then
+    return
+  fi
+
+  local torch_libs=(
+    "libtorch_cpu.dylib"
+    "libtorch.dylib"
+    "libc10.dylib"
+  )
+
+  for lib_name in "${torch_libs[@]}"; do
+    lib_path="${torch_lib_dir}/${lib_name}"
+    [[ -f "${lib_path}" ]] || continue
+
+    removed=0
+    # Repeatedly remove the @loader_path rpath entries until none remain.
+    while install_name_tool -delete_rpath @loader_path "${lib_path}" 2>/dev/null; do
+      removed=1
+    done
+
+    if [[ "${removed}" == "1" ]]; then
+      install_name_tool -add_rpath @loader_path "${lib_path}" || true
+    fi
+  done
+}
+
 install_domains() {
   echo "Install torchvision and torchaudio"
   pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"
@@ -101,6 +139,7 @@ install_pytorch_and_domains() {
     echo "Use cached wheel at ${cached_torch_wheel}"
   fi
 
+  dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
   export TORCHAUDIO_VERSION
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 8f0d8f6e571..c96b85740bc 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -351,6 +351,7 @@ jobs:
 
         # reinstall executorch
         bash ./install_executorch.sh --minimal
+        pip list
 
         # run python unittest
         python -m unittest examples.models.moshi.mimi.test_mimi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5091a2af2e..1b96c12fbf3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,28 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-# Setup RPATH. See
-# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
-# Use separate rpaths during build and install phases
-set(CMAKE_SKIP_BUILD_RPATH OFF)
-# Don't use the install-rpath during the build phase
-set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
-# Automatically add all linked folders that are NOT in the build directory to
-# the rpath (per library?)
-#
-# TODO: Doesn't work for us right now because we are not installing .so's into
-# the correct locations. For example we have libcustom_ops_aot_lib.so depending
-# on _portable_lib.so, which was eventually put under
-# <site-packages>/executorch/extension/pybindings/ but this rpath is not
-# automatically added because at build time it seems `portable_lib` is being
-# built under the same directory, so no extra rpath is being added. To properly
-# fix this we need to install `portable_lib` into the correct path.
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
-# ------------------------------ OPTIONS -------------------------------------
-# WARNING: Please don't add example specific options in this CMakeLists.txt.
-# Instead please use `find_package(executorch REQUIRED)` in the example
-# directory and add a new executable in the example `CMakeLists.txt`.
-
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all
@@ -909,12 +887,13 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   # Set RPATH to find PyTorch libraries relative to the installation location
   # This goes from executorch/extension/pybindings up to site-packages, then to
-  # torch/lib
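+  # torch/lib. This is skipped on APPLE: adding an @loader_path-based rpath
+  # there makes the linker fail with a duplicate LC_RPATH error (see the note
+  # inside the if(APPLE) branch below).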
   if(APPLE)
-    set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                              INSTALL_RPATH "@loader_path/../../../torch/lib"
-    )
+    # Skip setting an @loader_path rpath on APPLE; it causes a linker error:
+    # ld: duplicate LC_RPATH '@loader_path' in
+    # '<site-packages>/torch/lib/libtorch_cpu.dylib'
   else()
     set_target_properties(
       portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index 2e72fc39821..82ce2521750 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -71,6 +71,11 @@ using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
 
+// Update the model container with the constant tensors
+using AOTInductorModelUpdateConstantsFromBlobFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    const uint8_t* weight_blob_ptr);
+
 } // extern "C"
 
 // AOTI Delegate Handle structure
@@ -87,6 +92,7 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumInputsFunc get_num_inputs;
   AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
   AOTInductorModelContainerRunFunc run;
+  AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
 };
 
 } // namespace aoti
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 05d01972833..ba6da92b991 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -146,8 +146,11 @@ def preprocess(
             "aot_inductor.embed_kernel_binary": True,
             # Do not link against the full PyTorch/libtorch library
             "aot_inductor.link_libtorch": False,
-            # Package model constants and other generated files directly in the shared object (.so) file
-            "aot_inductor.package_constants_in_so": True,
+            # Separate weight constants from the .so file
+            "aot_inductor.package": True,
+            "aot_inductor.package_constants_in_so": False,
+            # Store weight constants on disk in a binary blob
+            "aot_inductor.package_constants_on_disk_format": "binary_blob",
             # Enable maximum automatic tuning for optimal performance
             "max_autotune": True,
             # Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch
@@ -162,7 +165,8 @@ def preprocess(
             ]
         ), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
-            so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
+            # Here we should expect 1 so file and 1 weight blob in the same directory.
+            paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
             if len(missing_fallback_kernels) > 0:
                 formatted_kernels = "\n  - ".join(sorted(missing_fallback_kernels))
                 raise RuntimeError(
@@ -170,17 +174,40 @@ def preprocess(
                     "Please add them to the AOTI backend."
                 )
 
+        # Extract the .so and .blob paths from the returned list
+        so_path = None
+        blob_path = None
+        for path in paths:
+            if path.endswith(".wrapper.so"):
+                so_path = path
+            elif path.endswith(".wrapper_weights.blob"):
+                blob_path = path
+
+        if so_path is None or blob_path is None:
+            raise RuntimeError(
+                f"Could not find required files in compiled paths, got {paths}"
+            )
+
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
             so_data = f.read()
 
         named_data_store = NamedDataStore()
         method_name = CudaBackend.method_name_from_compile_specs(compile_specs)
+
+        # Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
+        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
+
+        # Add weights blob to named data store
+        with open(blob_path, "rb") as f:
+            blob_data = f.read()
         named_data_store.add_named_data(
-            method_name + "_so_blob", so_data, 1, "aoti_cuda_blob"
+            method_name + "_weights_blob", blob_data, 1, "aoti_cuda_blob"
         )
+        # Clean up the weights blob file
+        os.remove(blob_path)
 
-        # Clean up the generated so file; it has been packaged into the NamdeDataStore
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
         # pyre-ignorep[6]: Incompatible parameter type
         os.remove(so_path)
 
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index e61b03ee8e6..0cef859ddfb 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -27,15 +27,6 @@
 
 namespace executorch::backends::cuda {
 
-#define LOAD_SYMBOL(handle, member, name, so_handle)                 \
-  do {                                                               \
-    auto symbol_res = get_function(so_handle, #name);                \
-    if (!symbol_res.ok()) {                                          \
-      return symbol_res.error();                                     \
-    }                                                                \
-    handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
-  } while (0)
-
 using namespace std;
 using namespace aoti;
 
@@ -61,29 +52,37 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
-    LOAD_SYMBOL(
-        handle,
-        create_with_device,
-        AOTInductorModelContainerCreateWithDevice,
-        so_handle);
+#define LOAD_SYMBOL(member, name)                                    \
+  do {                                                               \
+    auto symbol_res = get_function(so_handle, #name);                \
+    if (!symbol_res.ok()) {                                          \
+      return symbol_res.error();                                     \
+    }                                                                \
+    handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
+  } while (0)
+
+    LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice);
 
-    LOAD_SYMBOL(
-        handle, delete_container, AOTInductorModelContainerDelete, so_handle);
+    LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete);
 
-    LOAD_SYMBOL(
-        handle,
-        get_num_inputs,
-        AOTInductorModelContainerGetNumInputs,
-        so_handle);
+    LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs);
 
-    LOAD_SYMBOL(
-        handle,
-        get_num_outputs,
-        AOTInductorModelContainerGetNumOutputs,
-        so_handle);
+    LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs);
 
-    LOAD_SYMBOL(handle, run, AOTInductorModelContainerRun, so_handle);
+    LOAD_SYMBOL(run, AOTInductorModelContainerRun);
+#undef LOAD_SYMBOL
 
+    auto symbol_res =
+        get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob");
+    if (symbol_res.ok()) {
+      handle->update_constants_from_blob =
+          reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
+              symbol_res.get());
+    } else {
+      ET_LOG(
+          Info,
+          "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)");
+    }
     return Error::Ok;
   }
 
@@ -112,13 +111,13 @@ class ET_EXPERIMENTAL CudaBackend final
         method_name.empty() ? "so_blob" : method_name + "_so_blob";
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
-    auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
+    auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
     ET_CHECK_OR_RETURN_ERROR(
-        aoti_cuda_buffer.ok(),
+        aoti_dso_buffer.ok(),
         Internal,
         "Failed to get data for key %s: 0x%x",
         so_blob_key.c_str(),
-        static_cast<uint32_t>(aoti_cuda_buffer.error()));
+        static_cast<uint32_t>(aoti_dso_buffer.error()));
 
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
@@ -132,12 +131,12 @@ class ET_EXPERIMENTAL CudaBackend final
     ET_LOG(
         Info,
         "Writing %zu bytes to %s",
-        aoti_cuda_buffer->size(),
+        aoti_dso_buffer->size(),
         so_path.c_str());
 
     outfile.write(
-        static_cast<const char*>(aoti_cuda_buffer->data()),
-        aoti_cuda_buffer->size());
+        static_cast<const char*>(aoti_dso_buffer->data()),
+        aoti_dso_buffer->size());
 
     ET_CHECK_OR_RETURN_ERROR(
         outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
@@ -145,6 +144,8 @@ class ET_EXPERIMENTAL CudaBackend final
     // Finish writing the file to disk
     outfile.close();
 
+    // Free the buffer immediately after writing to disk
+    aoti_dso_buffer->Free();
     // Load the lib
     Result<void*> lib_handle_res = load_library(so_path);
     if (!lib_handle_res.ok()) {
@@ -172,6 +173,19 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->container_handle = container_handle;
 
+    // Look into named data map for constant data
+    std::string weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+    auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
+    if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
+      ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
+      const void* weights_blob = buffer_res->data();
+      // Feed the weights blob into the container. Under the hood it's copying
+      // weights, so we should free the buffer immediately.
+      ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
+          handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
+      buffer_res->Free();
+    }
     // Create a CUDA stream for asynchronous execution
     cudaStream_t cuda_stream;
     ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index 6df4caf8692..bddd960f8a7 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,9 +8,9 @@
 set -x
 
 conda install -c conda-forge "ffmpeg<8" -y
-pip install torchcodec==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-pip install moshi==0.2.4
-pip install bitsandbytes soundfile
+pip install torchcodec==0.7.0.dev20251012 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install moshi==0.2.11
+pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 bash "$SCRIPT_DIR"/../../llama/install_requirements.sh
diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py
index d0c3c2ceb15..93513c54e78 100644
--- a/examples/models/moshi/mimi/test_mimi.py
+++ b/examples/models/moshi/mimi/test_mimi.py
@@ -189,8 +189,7 @@ def forward(self, x):
                 x = self.mimi_model.upsample(x)
                 (emb,) = self.mimi_model.decoder_transformer(x)
                 emb.transpose(1, 2)
-                with self.mimi_model._context_for_encoder_decoder:
-                    out = self.mimi_model.decoder(emb)
+                out = self.mimi_model.decoder(emb)
                 return out
 
         emb_input = torch.rand(1, 1, 512, device="cpu")
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index b3dd5e3ab68..29edf955751 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -319,7 +319,7 @@ int32_t main(int32_t argc, char** argv) {
   // Create multimodal runner
   std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
       ::executorch::extension::llm::create_multimodal_runner(
-          model_path, std::move(tokenizer), data_path);
+          model_path, std::move(tokenizer), data_path, Module::LoadMode::Mmap);
   if (runner == nullptr) {
     ET_LOG(Error, "Failed to create multimodal runner");
     return 1;
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index d1e4ff2ce45..674be820072 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -268,7 +268,8 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 std::unique_ptr<MultimodalRunner> create_multimodal_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::optional<const std::string> data_path) {
+    std::optional<const std::string> data_path,
+    Module::LoadMode load_mode) {
   // Sanity check tokenizer
   if (!tokenizer || !tokenizer->is_loaded()) {
     ET_LOG(Error, "Tokenizer is null or not loaded");
@@ -278,10 +279,9 @@ std::unique_ptr<MultimodalRunner> create_multimodal_runner(
   // Create the Module
   std::unique_ptr<Module> module;
   if (data_path.has_value()) {
-    module = std::make_unique<Module>(
-        model_path, data_path.value(), Module::LoadMode::File);
+    module = std::make_unique<Module>(model_path, data_path.value(), load_mode);
   } else {
-    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+    module = std::make_unique<Module>(model_path, load_mode);
   }
 
   // Get metadata from Module
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 5c109581e19..08f0efd0353 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -140,6 +140,7 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::optional<const std::string> data_path = std::nullopt);
+    std::optional<const std::string> data_path = std::nullopt,
+    Module::LoadMode load_mode = Module::LoadMode::File);
 
 } // namespace executorch::extension::llm
diff --git a/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h b/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
index 556699be04b..6321297a61c 100644
--- a/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
+++ b/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
@@ -70,7 +70,7 @@ enum ZeroBehavior {
 namespace detail {
 template <typename T, std::size_t SizeOfT>
 struct TrailingZerosCounter {
-  static std::size_t count(T Val, ZeroBehavior) {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
     if (!Val)
       return std::numeric_limits<T>::digits;
     if (Val & 0x1)
@@ -147,7 +147,7 @@ std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
 namespace detail {
 template <typename T, std::size_t SizeOfT>
 struct LeadingZerosCounter {
-  static std::size_t count(T Val, ZeroBehavior) {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
     if (!Val)
       return std::numeric_limits<T>::digits;
 
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index 558edb175ae..e340e7626a0 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -359,6 +359,7 @@ static inline int C10_WARP_SIZE_INTERNAL() {
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)
 #define SYCL_KERNEL_ASSERT(cond)
 #elif defined(_MSC_VER)
 #if defined(NDEBUG)
@@ -396,6 +397,26 @@ __host__ __device__
                static_cast<unsigned>(__LINE__)), \
            0);                                   \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                     \
+  if (C10_UNLIKELY(!(cond))) {                                        \
+    (void)(printf(                                                    \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(           \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: " \
+                      "Assertion failed: `" #cond "`: " msg "\n",     \
+        __func__,                                                     \
+        blockIdx.x,                                                   \
+        blockIdx.y,                                                   \
+        blockIdx.z,                                                   \
+        threadIdx.x,                                                  \
+        threadIdx.y,                                                  \
+        threadIdx.z,                                                  \
+        ##__VA_ARGS__));                                              \
+    (void)(_wassert(                                                  \
+               _CRT_WIDE(#cond),                                      \
+               _CRT_WIDE(__FILE__),                                   \
+               static_cast<unsigned>(__LINE__)),                      \
+           0);                                                        \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                 \
   if (C10_UNLIKELY(!(cond))) {                   \
     (void)(_wassert(                             \
@@ -455,6 +476,10 @@ __host__ __device__
   if C10_UNLIKELY (!(cond)) {             \
     abort();                              \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...) \
+  if C10_UNLIKELY (!(cond)) {                     \
+    abort();                                      \
+  }
 #define SYCL_KERNEL_ASSERT(cond) \
   if C10_UNLIKELY (!(cond)) {    \
     abort();                     \
@@ -470,6 +495,23 @@ __host__ __device__
     __assert_fail(                                                     \
         msg, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                        \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    printf(                                                            \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(            \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: "  \
+            "Assertion failed: `" #cond "`: " msg "\n",                \
+        __func__,                                                      \
+        blockIdx.x,                                                    \
+        blockIdx.y,                                                    \
+        blockIdx.z,                                                    \
+        threadIdx.x,                                                   \
+        threadIdx.y,                                                   \
+        threadIdx.z,                                                   \
+        ##__VA_ARGS__); \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                                         \
   if (C10_UNLIKELY(!(cond))) {                                           \
     __assert_fail(                                                       \
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
index 2c1f805ac7b..ac47e3f844a 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
@@ -39,7 +39,9 @@ struct alignas(2) BFloat16 {
     return from_bits_t();
   }
 
-  constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t)
+  constexpr C10_HOST_DEVICE BFloat16(
+      unsigned short bits,
+      from_bits_t /*unused*/)
       : x(bits) {}
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/Half.h b/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
index 59a86f07e33..9673301e2de 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
@@ -80,7 +80,8 @@ struct alignas(2) Half {
   Half() = default;
 #endif
 
-  constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}
+  constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t /*unused*/)
+      : x(bits) {}
 #if defined(__aarch64__) && !defined(__CUDACC__)
   inline Half(float16_t value);
   inline operator float16_t() const;
diff --git a/torch_pin.py b/torch_pin.py
index 02040c91963..5e54c848d13 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20251003"
+NIGHTLY_VERSION = "dev20251015"