diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b3a17f4f669..b06c0e3bdcc 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -2457,6 +2457,12 @@ def cast_to_fp32(model, inputs):
     return cast_to(torch.float32, model, inputs)
 
 
+def cast_to_device(device, model, inputs):
+    model = model.to(device=device)
+    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
+    return model, inputs
+
+
 class DummyGradScaler:
     def scale(self, loss):
         return loss
@@ -2948,10 +2954,24 @@ class BenchmarkRunner:
             model_fp64 = None
             inputs_fp64 = None
             try:
-                model_fp64, inputs_fp64 = cast_to_fp64(
-                    self.deepcopy_and_maybe_parallelize(model),
-                    clone_inputs(example_inputs),
-                )
+                # Currently, XPU GEMM FP64 support is WIP. Therefore, we explicitly fall back
+                # to CPU to execute FP64 and take the result as the golden reference.
+                if current_device == "xpu":
+                    model_fp64, inputs_fp64 = cast_to_fp64(
+                        *cast_to_device(
+                            "cpu",
+                            self.deepcopy_and_maybe_parallelize(model),
+                            clone_inputs(example_inputs),
+                        )
+                    )
+                else:
+                    model_fp64, inputs_fp64 = cast_to_fp64(
+                        self.deepcopy_and_maybe_parallelize(model),
+                        clone_inputs(example_inputs),
+                    )
+
+                # current_device of init_optimizer only impacts which optimizer will be applied.
+                # It does not change any tensor internally. Hence, we leave it as is rather than passing cpu.
                 self.init_optimizer(name, current_device, model_fp64.parameters())
                 fp64_outputs = self.run_n_iterations(
                     model_fp64, inputs_fp64, self.model_iter_fn
@@ -2962,11 +2982,19 @@ class BenchmarkRunner:
                     else x,
                     fp64_outputs,
                 )
-            except Exception:
+                if current_device == "xpu":
+                    fp64_outputs = tree_map_only(
+                        torch.Tensor,
+                        lambda x: x.to(device=current_device),
+                        fp64_outputs,
+                    )
+            except Exception as e:
                 log.warning(
                     "fp64 golden ref were not generated for %s. Setting accuracy check to cosine",
                     name,
                 )
+                error_msg = f"current_device={current_device}; error:{str(e)}"
+                log.warning(error_msg)
                 self.args.cosine = True
                 fp64_outputs = None
             finally:
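
For context, the following is a minimal, self-contained sketch of the CPU-fallback flow this patch adds. cast_to_device is taken from the diff; cast_to_fp64 here is a simplified stand-in for the harness's cast_to(torch.float64, ...) helper, and fp64_golden_reference plus the toy model are illustrative only and not part of the patch.

import torch
from torch.utils._pytree import tree_map_only


def cast_to_device(device, model, inputs):
    # Move the model and every tensor in the (possibly nested) inputs to `device`.
    model = model.to(device=device)
    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
    return model, inputs


def cast_to_fp64(model, inputs):
    # Simplified stand-in: promote the model and all floating-point inputs to float64.
    model = model.to(dtype=torch.float64)
    inputs = tree_map_only(
        torch.Tensor,
        lambda x: x.to(dtype=torch.float64) if x.is_floating_point() else x,
        inputs,
    )
    return model, inputs


def fp64_golden_reference(model, example_inputs, current_device):
    # Mirror of the patched control flow: on "xpu", hop to CPU for the FP64
    # reference run, then move the outputs back to the benchmark device.
    if current_device == "xpu":
        model_fp64, inputs_fp64 = cast_to_fp64(
            *cast_to_device("cpu", model, example_inputs)
        )
    else:
        model_fp64, inputs_fp64 = cast_to_fp64(model, example_inputs)

    outputs = model_fp64(*inputs_fp64)

    if current_device == "xpu":
        outputs = tree_map_only(
            torch.Tensor, lambda x: x.to(device=current_device), outputs
        )
    return outputs


if __name__ == "__main__":
    model = torch.nn.Linear(4, 2)
    example_inputs = (torch.randn(3, 4),)
    # Use "cpu" here so the sketch runs anywhere; on an XPU build, pass "xpu"
    # to exercise the fallback-and-copy-back path.
    ref = fp64_golden_reference(model, example_inputs, current_device="cpu")
    print(ref.dtype)  # torch.float64

Copying the FP64 outputs back to current_device keeps the later accuracy comparison on the same device as the compiled run, so only the reference computation itself is moved to CPU.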