Skip to content

[distributed][tensor] RuntimeError: Comparing #2227

@zxd1997066

Description

@zxd1997066

🐛 Describe the bug

Please get the wheels from https://github.com/intel/torch-xpu-ops/actions/runs/18771826220, or download them with the GitHub CLI (`gh run download`), then run the following commands to reproduce:

gh run download 18771826220 --repo intel/torch-xpu-ops --name Torch-XPU-Wheel-1826 --dir path --pattern "*.zip"
git clone -b distributed_2.10 https://github.com/daisyden/pytorch.git
cd pytorch
pip install -r requirements.txt
pip install pytest expecttest
pytest -v test/distributed/tensor/test_tensor_ops.py::DistTensorOpsTestWithLocalTensor::test_index_put_scalar
pytest -v test/distributed/tensor/test_tensor_ops.py::DistTensorOpsTestWithLocalTensor::test_index_put_tensor
Traceback (most recent call last):
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_comparison.py", line 1298, in not_close_error_metas
    pair.compare()
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_comparison.py", line 747, in compare
    self._compare_values(actual, expected)
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_comparison.py", line 905, in _compare_values
    compare_fn(
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_comparison.py", line 1087, in _compare_regular_values_close
    if torch.all(matches):
AttributeError: 'LocalIntNode' object has no attribute 'guard_int'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/unittest/case.py", line 59, in testPartExecutor
    yield
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/unittest/case.py", line 591, in run
    self._callTestMethod(testMethod)
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/unittest/case.py", line 549, in _callTestMethod
    method()
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 730, in wrapper
    fn()
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3302, in wrapper
    method(*args, **kwargs)
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 759, in wrapped
    out = fn(self)
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 522, in wrapper
    raise e
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 519, in wrapper
    func(self, *args, **kwargs)  # type: ignore[misc]
  File "/home/sdp/xiangdong/pytorch/test/distributed/tensor/test_tensor_ops.py", line 673, in test_index_put_scalar
    self.assertEqual(output_dt.full_tensor(), ref)
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 4218, in assertEqual
    error_metas = not_close_error_metas(
  File "/home/sdp/miniforge-pypy3/envs/xccl_ww27/lib/python3.10/site-packages/torch/testing/_comparison.py", line 1304, in not_close_error_metas
    raise RuntimeError(
RuntimeError: Comparing

TensorOrArrayPair(
    id=(),
    actual=LocalTensor(
  0: tensor([[[-1.6165,  0.5685, -0.5102, -0.9113, -1.1555, -0.2262, -1.2891,
           1.0654],
         [-0.7167, -0.5333,  0.2078, -0.9798,  0.7447, -0.2395,  0.2737,
           0.0920],
         [-0.8036, -1.6377,  2.2774,  0.4469,  2.0805, -0.0777, -0.2435,
          -0.2230],
         [ 0.3834, -0.2091, -0.2408,  0.2274, -1.5866, -0.2701, -1.2524,
          -0.3004]],

        [[-0.2514,  1.4439, -0.3719,  0.1226,  0.5674,  0.1105, -0.8598,
           1.6837],
         [-0.9736, -0.3028,  1.1816, -0.4079, -0.9062,  0.8258,  2.9949,
          -0.2411],
         [ 0.7362,  1.2564, -2.0559,  1.3624, -1.1564,  0.0883,  1.6727,
          -0.2501],
         [-0.2095, -1.2228, -2.6263,  0.4527,  1.5333, -1.1749,  0.6565,
          -0.9248]]], device='xpu:0'),
  1: tensor([[[-1.6165,  0.5685, -0.5102, -0.9113, -1.1555, -0.2262, -1.2891,
           1.0654],
         [-0.7167, -0.5333,  0.2078, -0.9798,  0.7447, -0.2395,  0.2737,
           0.0920],
         [-0.8036, -1.6377,  2.2774,  0.4469,  2.0805, -0.0777, -0.2435,
          -0.2230],
         [ 0.3834, -0.2091, -0.2408,  0.2274, -1.5866, -0.2701, -1.2524,
          -0.3004]],

        [[-0.2514,  1.4439, -0.3719,  0.1226,  0.5674,  1.4184, -0.8598,
           1.6837],
         [-0.9736, -0.3028,  1.1816, -0.4079, -0.9062,  0.8258,  2.9949,
          -0.2411],
         [ 0.7362,  1.2564, -2.0559,  1.3624, -1.1564,  0.0883,  1.6727,
          -0.2501],
         [-0.2095, -1.2228, -2.6263,  0.4527,  1.5333, -1.1749,  0.6565,
           0.1105]]], device='xpu:0'),
  2: tensor([[[-1.6165,  0.5685, -0.5102, -0.9113, -1.1555, -0.2262, -1.2891,
           1.0654],
         [-0.7167, -0.5333,  0.2078, -0.9798,  0.7447, -0.2395,  0.1105,
           0.0920],
         [-0.8036, -1.6377,  2.2774,  0.4469,  2.0805, -0.0777, -0.2435,
          -0.2230],
         [ 0.3834, -0.2091, -0.2408,  0.2274, -1.5866, -0.2701, -1.2524,
          -0.3004]],

        [[-0.2514,  1.4439, -0.3719,  0.1226,  0.5674,  1.4184, -0.8598,
           1.6837],
         [-0.9736, -0.3028,  1.1816, -0.4079, -0.9062,  0.8258,  2.9949,
          -0.2411],
         [ 0.7362,  1.2564, -2.0559,  1.3624, -1.1564,  0.0883,  1.6727,
          -0.2501],
         [-0.2095, -1.2228, -2.6263,  0.4527,  1.5333, -1.1749,  0.6565,
          -0.9248]]], device='xpu:0'),
  3: tensor([[[-1.6165,  0.5685, -0.5102, -0.9113, -1.1555, -0.2262, -1.2891,
           1.0654],
         [-0.7167, -0.5333,  0.2078, -0.9798,  0.7447, -0.2395,  0.2737,
           0.0920],
         [-0.8036, -1.6377,  2.2774,  0.4469,  2.0805, -0.0777, -0.2435,
          -0.2230],
         [ 0.3834, -0.2091, -0.2408,  0.2274, -1.5866, -0.2701, -1.2524,
          -0.3004]],

        [[-0.2514,  1.4439, -0.3719,  0.1226,  0.5674,  1.4184, -0.8598,
           1.6837],
         [-0.9736, -0.3028,  0.1105, -0.4079, -0.9062,  0.8258,  2.9949,
          -0.2411],
         [ 0.7362,  1.2564, -2.0559,  1.3624, -1.1564,  0.0883,  1.6727,
          -0.2501],
         [-0.2095, -1.2228, -2.6263,  0.4527,  1.5333, -1.1749,  0.6565,
          -0.9248]]], device='xpu:0')
),
    expected=LocalTensor(
  0: tensor([[[-1.6165,  0.5685, -0.5102, -0.9113, -1.1555, -0.2262, -1.2891,
           1.0654],
         [-0.7167, -0.5333,  0.2078, -0.9798,  0.7447, -0.2395,  0.2737,
           0.0920],
         [-0.8036, -1.6377,  2.2774,  0.4469,  2.0805, -0.0777, -0.2435,
          -0.2230],
         [ 0.3834, -0.2091, -0.2408,  0.2274, -1.5866, -0.2701, -1.2524,
          -0.3004]],

        [[-0.2514,  1.4439, -0.3719,  0.1226,  0.5674,  0.1105, -0.8598,
           1.6837],
         [-0.9736, -0.3028,  1.1816, -0.4079, -0.9062,  0.8258,  2.9949,
          -0.2411],
         [ 0.7362,  1.2564, -2.0559,  1.3624, -1.1564,  0.0883,  1.6727,
          -0.2501],
         [-0.2095, -1.2228, -2.6263,  0.4527,  1.5333, -1.1749,  0.6565,
          -0.9248]]], device='xpu:0'),
  1: tensor([[[ 0.6331,  1.6358, -0.3459,  1.0196, -0.4122,  1.4279,  0.9691,
          -1.6059],
         [ 0.4750,  2.0320,  0.6990, -1.5602,  0.3154,  0.7306,  0.6595,
          -2.1267],
         [ 0.8045,  0.0338, -0.0880,  0.6372,  0.7436, -0.8114, -0.3861,
           0.9831],
         [-0.6453, -0.8731, -1.7578,  0.1951, -0.3828,  0.4662, -0.2241,
           1.1444]],

        [[ 0.5584, -1.2858, -0.4909,  1.1547, -1.3643,  1.8305,  0.7982,
           1.8111],
         [-0.1695, -1.4307, -0.4022, -1.2092, -1.2156,  0.6202,  0.0528,
          -0.2820],
         [-1.0747, -0.9203, -1.2738,  0.2902,  0.3817,  1.7172,  0.7362,
           0.1958],
         [-1.8239,  1.7929, -2.0741,  0.4553, -1.8539,  0.5169,  1.2180,
           0.1105]]], device='xpu:0'),
  2: tensor([[[-0.1623,  1.8079, -1.0533,  1.3708,  0.3522, -1.6017, -1.8394,
           0.3308],
         [-0.9794,  1.1197, -0.2649,  2.7342,  1.0942, -0.6595,  0.1105,
           0.5713],
         [ 0.6352, -0.2614, -2.3907,  0.6319,  1.2192, -1.8317,  1.8331,
          -1.4025],
         [-0.8531, -0.6568,  0.9970,  1.0180,  0.6035, -0.3791,  0.2757,
          -1.1518]],

        [[-2.0452, -0.2765,  1.3527, -1.9871,  0.2513, -0.4145,  0.7712,
          -0.0469],
         [ 0.7190, -0.1964, -1.3418, -0.1360, -0.7904,  0.5635,  0.4989,
          -0.8626],
         [-0.5854,  0.1135,  0.2430,  0.4178, -1.7545, -1.8663,  2.4396,
          -0.2938],
         [ 1.2275, -0.5825, -0.7190, -0.5401, -1.1625, -0.4761,  1.4125,
           1.2533]]], device='xpu:0'),
  3: tensor([[[ 6.3410e-01, -1.2250e+00,  2.3015e-02,  4.1598e-01, -1.2895e+00,
           7.3373e-01,  9.6372e-01, -4.5297e-01],
         [ 4.3157e-01, -4.6403e-01,  1.4228e+00, -5.6680e-01,  2.5958e+00,
          -1.0071e-01, -5.5872e-01, -1.6787e-01],
         [-5.6172e-01, -7.8447e-01, -7.8194e-01,  4.8986e-01, -8.4717e-04,
           6.2117e-01,  1.4144e-01,  5.0568e-01],
         [ 1.4254e+00, -3.4932e-01,  4.3589e-01,  7.6691e-01,  1.9459e-01,
           1.5380e+00, -9.6551e-01,  1.3818e+00]],

        [[-1.3231e+00, -1.8859e+00, -9.6443e-01,  1.1178e+00, -2.4178e-01,
           1.2531e+00, -1.0125e+00,  1.0729e+00],
         [-1.0330e+00,  8.7000e-01,  1.1051e-01, -1.5859e-01, -8.1190e-01,
           8.1857e-01,  6.4280e-01,  7.5953e-01],
         [-8.3176e-01, -1.5700e-01,  1.6111e+00,  1.4884e+00,  1.4107e+00,
          -6.4074e-01, -9.6468e-01, -1.0335e+00],
         [-2.4674e+00,  1.2764e+00, -2.3045e-02, -6.7045e-02,  1.1326e-02,
          -1.4494e+00,  1.7346e+00, -8.4481e-01]]], device='xpu:0')
),
    rtol=1.3e-06,
    atol=1e-05,
    equal_nan=True,
    check_device=False,
    check_dtype=True,
    check_layout=False,
    check_stride=False,
)

resulted in the unexpected exception above. If you are a user and see this message during normal operation please file an issue at https://github.com/pytorch/pytorch/issues. If you are a developer and working on the comparison functions, please except the previous error and raise an expressive `ErrorMeta` instead.

Versions

https://github.com/daisyden/pytorch/tree/distributed_2.10

Metadata

Metadata

Assignees

Labels

bug (Something isn't working), module: distributed (For distributed feature issue)

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions