diff --git a/scripts/patch-pytorch.sh b/scripts/patch-pytorch.sh
index 4c329fd7d9..a7070889b3 100755
--- a/scripts/patch-pytorch.sh
+++ b/scripts/patch-pytorch.sh
@@ -16,6 +16,5 @@ fi
 echo "Applying PyTorch patches in $REPO_ROOT"
 cd "$REPO_ROOT"
 
-# curl -sSL https://github.com/pytorch/pytorch/pull/126516.diff | git apply -
-git apply "${SCRIPT_DIR}/pytorch_fp64.patch"
+# put your patch applies here
 curl -sSL https://github.com/pytorch/pytorch/pull/143553.diff | git apply -
diff --git a/scripts/pytorch_fp64.patch b/scripts/pytorch_fp64.patch
deleted file mode 100644
index 69f43e1019..0000000000
--- a/scripts/pytorch_fp64.patch
+++ /dev/null
@@ -1,67 +0,0 @@
-diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
-index b3a17f4f669..b06c0e3bdcc 100644
---- a/benchmarks/dynamo/common.py
-+++ b/benchmarks/dynamo/common.py
-@@ -2457,6 +2457,12 @@ def cast_to_fp32(model, inputs):
-     return cast_to(torch.float32, model, inputs)
- 
- 
-+def cast_to_device(device, model, inputs):
-+    model = model.to(device=device)
-+    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
-+    return model, inputs
-+
-+
- class DummyGradScaler:
-     def scale(self, loss):
-         return loss
-@@ -2948,10 +2954,24 @@ class BenchmarkRunner:
-         model_fp64 = None
-         inputs_fp64 = None
-         try:
--            model_fp64, inputs_fp64 = cast_to_fp64(
--                self.deepcopy_and_maybe_parallelize(model),
--                clone_inputs(example_inputs),
--            )
-+            # Currently, XPU GEMM FP64 support is WIP. Therefore, we explicitly fallback to
-+            # CPU to execute FP64 and take the result as the gold reference.
-+            if current_device == "xpu":
-+                model_fp64, inputs_fp64 = cast_to_fp64(
-+                    *cast_to_device(
-+                        "cpu",
-+                        self.deepcopy_and_maybe_parallelize(model),
-+                        clone_inputs(example_inputs),
-+                    )
-+                )
-+            else:
-+                model_fp64, inputs_fp64 = cast_to_fp64(
-+                    self.deepcopy_and_maybe_parallelize(model),
-+                    clone_inputs(example_inputs),
-+                )
-+
-+            # current_device of init_optimizer only impacts which optimizer will be applied. It does
-+            # not change any tensor internally. Hence, we leave as it is rather than passing cpu.
-             self.init_optimizer(name, current_device, model_fp64.parameters())
-             fp64_outputs = self.run_n_iterations(
-                 model_fp64, inputs_fp64, self.model_iter_fn
-@@ -2962,11 +2982,19 @@
-                 else x,
-                 fp64_outputs,
-             )
--        except Exception:
-+            if current_device == "xpu":
-+                fp64_outputs = tree_map_only(
-+                    torch.Tensor,
-+                    lambda x: x.to(device=current_device),
-+                    fp64_outputs,
-+                )
-+        except Exception as e:
-             log.warning(
-                 "fp64 golden ref were not generated for %s. Setting accuracy check to cosine",
-                 name,
-             )
-+            error_msg = f"current_device={current_device}; error:{str(e)}"
-+            log.warning(error_msg)
-             self.args.cosine = True
-             fp64_outputs = None
-         finally: