diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b3a17f4f669..b06c0e3bdcc 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -2457,6 +2457,12 @@ def cast_to_fp32(model, inputs):
     return cast_to(torch.float32, model, inputs)
 
 
+def cast_to_device(device, model, inputs):
+    model = model.to(device=device)
+    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
+    return model, inputs
+
+
 class DummyGradScaler:
     def scale(self, loss):
         return loss
@@ -2948,10 +2954,24 @@ class BenchmarkRunner:
             model_fp64 = None
             inputs_fp64 = None
             try:
-                model_fp64, inputs_fp64 = cast_to_fp64(
-                    self.deepcopy_and_maybe_parallelize(model),
-                    clone_inputs(example_inputs),
-                )
+                # Currently, XPU GEMM FP64 support is WIP. Therefore, we explicitly fall back
+                # to CPU to execute FP64 and take the result as the golden reference.
+                if current_device == "xpu":
+                    model_fp64, inputs_fp64 = cast_to_fp64(
+                        *cast_to_device(
+                            "cpu",
+                            self.deepcopy_and_maybe_parallelize(model),
+                            clone_inputs(example_inputs),
+                        )
+                    )
+                else:
+                    model_fp64, inputs_fp64 = cast_to_fp64(
+                        self.deepcopy_and_maybe_parallelize(model),
+                        clone_inputs(example_inputs),
+                    )
+
+                # current_device of init_optimizer only impacts which optimizer will be applied.
+                # It does not change any tensor internally. Hence, we leave it as is rather than passing cpu.
                 self.init_optimizer(name, current_device, model_fp64.parameters())
                 fp64_outputs = self.run_n_iterations(
                     model_fp64, inputs_fp64, self.model_iter_fn
@@ -2962,11 +2982,19 @@ class BenchmarkRunner:
                     else x,
                     fp64_outputs,
                 )
-            except Exception:
+                if current_device == "xpu":
+                    fp64_outputs = tree_map_only(
+                        torch.Tensor,
+                        lambda x: x.to(device=current_device),
+                        fp64_outputs,
+                    )
+            except Exception as e:
                 log.warning(
                     "fp64 golden ref were not generated for %s. Setting accuracy check to cosine",
                     name,
                 )
+                error_msg = f"current_device={current_device}; error:{str(e)}"
+                log.warning(error_msg)
                 self.args.cosine = True
                 fp64_outputs = None
             finally:
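
For context, the following is a minimal, self-contained sketch of the CPU-fallback flow this patch adds. cast_to_device is taken from the diff; cast_to_fp64 here is a simplified stand-in for the harness's cast_to(torch.float64, ...) helper, and fp64_golden_reference plus the toy model are illustrative only and not part of the patch.

import torch
from torch.utils._pytree import tree_map_only


def cast_to_device(device, model, inputs):
    # Move the model and every tensor in the (possibly nested) inputs to `device`.
    model = model.to(device=device)
    inputs = tree_map_only(torch.Tensor, lambda x: x.to(device=device), inputs)
    return model, inputs


def cast_to_fp64(model, inputs):
    # Simplified stand-in: promote the model and all floating-point inputs to float64.
    model = model.to(dtype=torch.float64)
    inputs = tree_map_only(
        torch.Tensor,
        lambda x: x.to(dtype=torch.float64) if x.is_floating_point() else x,
        inputs,
    )
    return model, inputs


def fp64_golden_reference(model, example_inputs, current_device):
    # Mirror of the patched control flow: on "xpu", hop to CPU for the FP64
    # reference run, then move the outputs back to the benchmark device.
    if current_device == "xpu":
        model_fp64, inputs_fp64 = cast_to_fp64(
            *cast_to_device("cpu", model, example_inputs)
        )
    else:
        model_fp64, inputs_fp64 = cast_to_fp64(model, example_inputs)

    outputs = model_fp64(*inputs_fp64)

    if current_device == "xpu":
        outputs = tree_map_only(
            torch.Tensor, lambda x: x.to(device=current_device), outputs
        )
    return outputs


if __name__ == "__main__":
    model = torch.nn.Linear(4, 2)
    example_inputs = (torch.randn(3, 4),)
    # Use "cpu" here so the sketch runs anywhere; on an XPU build, pass "xpu"
    # to exercise the fallback-and-copy-back path.
    ref = fp64_golden_reference(model, example_inputs, current_device="cpu")
    print(ref.dtype)  # torch.float64

Copying the FP64 outputs back to current_device keeps the later accuracy comparison on the same device as the compiled run, so only the reference computation itself is moved to CPU.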