diff --git a/scripts/patch-pytorch.sh b/scripts/patch-pytorch.sh
index 4c329fd7d9..a7070889b3 100755
--- a/scripts/patch-pytorch.sh
+++ b/scripts/patch-pytorch.sh
@@ -16,6 +16,5 @@ fi
 echo "Applying PyTorch patches in $REPO_ROOT"
 cd "$REPO_ROOT"
 
-# curl -sSL https://github.com/pytorch/pytorch/pull/126516.diff | git apply -
-git apply "${SCRIPT_DIR}/pytorch_fp64.patch"
+# put your patch applies here
 curl -sSL https://github.com/pytorch/pytorch/pull/143553.diff | git apply -
diff --git a/scripts/pytorch_fp64.patch b/scripts/pytorch_fp64.patch
deleted file mode 100644
index 69f43e1019..0000000000
--- a/scripts/pytorch_fp64.patch
+++ /dev/null
@@ -1,67 +0,0 @@
-diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
-index b3a17f4f669..b06c0e3bdcc 100644
---- a/benchmarks/dynamo/common.py
-+++ b/benchmarks/dynamo/common.py
-@@ -2457,6 +2457,12 @@ def cast_to_fp32(model, inputs):
-     return cast_to(torch.float32, model, inputs)
- 
- 
-+def cast_to_device(device, model, inputs):
-+    model = model.to(device=device)
-+    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
-+    return model, inputs
-+
-+
- class DummyGradScaler:
-     def scale(self, loss):
-         return loss
-@@ -2948,10 +2954,24 @@ class BenchmarkRunner:
-         model_fp64 = None
-         inputs_fp64 = None
-         try:
--            model_fp64, inputs_fp64 = cast_to_fp64(
--                self.deepcopy_and_maybe_parallelize(model),
--                clone_inputs(example_inputs),
--            )
-+            # Currently, XPU GEMM FP64 support is WIP. Therefore, we explicitly fallback to
-+            # CPU to execute FP64 and take the result as the gold reference.
-+            if current_device == "xpu":
-+                model_fp64, inputs_fp64 = cast_to_fp64(
-+                    *cast_to_device(
-+                        "cpu",
-+                        self.deepcopy_and_maybe_parallelize(model),
-+                        clone_inputs(example_inputs),
-+                    )
-+                )
-+            else:
-+                model_fp64, inputs_fp64 = cast_to_fp64(
-+                    self.deepcopy_and_maybe_parallelize(model),
-+                    clone_inputs(example_inputs),
-+                )
-+
-+            # current_device of init_optimizer only impacts which optimizer will be applied. It does
-+            # not change any tensor internally. Hence, we leave as it is rather than passing cpu.
-             self.init_optimizer(name, current_device, model_fp64.parameters())
-             fp64_outputs = self.run_n_iterations(
-                 model_fp64, inputs_fp64, self.model_iter_fn
-@@ -2962,11 +2982,19 @@
-                 else x,
-                 fp64_outputs,
-             )
--        except Exception:
-+            if current_device == "xpu":
-+                fp64_outputs = tree_map_only(
-+                    torch.Tensor,
-+                    lambda x: x.to(device=current_device),
-+                    fp64_outputs,
-+                )
-+        except Exception as e:
-             log.warning(
-                 "fp64 golden ref were not generated for %s. Setting accuracy check to cosine",
-                 name,
-             )
-+            error_msg = f"current_device={current_device}; error:{str(e)}"
-+            log.warning(error_msg)
-             self.args.cosine = True
-             fp64_outputs = None
-         finally: