Describe the bug
# Patched train_ch6 in d2l/torch.py; `d2l` and `evaluate_accuracy_gpu` are
# module-level names in that file.
import torch
from torch import nn

def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6).

    Defined in :numref:`sec_lenet`"""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    net = net.to(device)
    net.train()
    net = torch.compile(net)  # modification: compile for the XPU backend
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            # Modification: run the forward pass under bfloat16 autocast on XPU.
            with torch.amp.autocast("xpu", dtype=torch.bfloat16):
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
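
The failure below looks tied to the device lacking native fp64: the collect_env output at the bottom reports has_fp64=0 for this 0x56a0 part, while the Inductor-generated batch-norm kernel apparently emits double-precision ops. As a stopgap I can avoid the crash by skipping torch.compile on fp64-less devices; a minimal sketch (maybe_compile is my own hypothetical helper, not part of d2l):

import torch

def maybe_compile(net, device):
    """Compile only when the XPU reports fp64 support; otherwise stay eager."""
    if device.type == "xpu":
        props = torch.xpu.get_device_properties(device)
        # has_fp64 is 0 on this A770-class part (see collect_env below).
        if not getattr(props, "has_fp64", True):
            return net  # eager fallback avoids the Triton double-type build error
    return torch.compile(net)

With net = torch.compile(net) replaced by net = maybe_compile(net, device), training should proceed in eager mode, as with the stock train_ch6. Running the unmodified script reproduces the crash: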
python3 ./redisual_learn.py
training on xpu:0
in kernel: 'triton_per_fused__native_batch_norm_legit_functional_copy__5'
error: backend compiler failed build.
error: Double type is not supported on this platform.
in file: cntm6tjt34k6w4pvuj3uh3ygaz6n2jhqkzsq6zshu2nm4au7anyh.py:60
(the same kernel error repeats several more times, with the failing file line alternating between :60 and :61)
Error during Intel loadBinary: ZE_RESULT_ERROR_MODULE_BUILD_FAILURE
Traceback (most recent call last):
File "/home/kang/wkplace/sample/./redisual_learn.py", line 89, in
train_mini_fashion_mnist()
File "/home/kang/wkplace/sample/./redisual_learn.py", line 56, in train_mini_fashion_mnist
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
File "/home/kang/wkplace/sample/d2l/torch.py", line 532, in train_ch6
y_hat = net(X)
^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 414, in call
return super().call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 845, in compile_wrapper
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/compile_fx.py", line 990, in _compile_fx_inner
raise InductorError(e, currentframe()).with_traceback(
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/compile_fx.py", line 974, in _compile_fx_inner
mb_compiled_graph = fx_codegen_and_compile(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/compile_fx.py", line 1695, in fx_codegen_and_compile
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/compile_fx.py", line 1505, in codegen_and_compile
compiled_module = graph.compile_to_module()
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/graph.py", line 2319, in compile_to_module
return self._compile_to_module()
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/graph.py", line 2329, in _compile_to_module
mod = self._compile_to_module_lines(wrapper_code)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/graph.py", line 2397, in _compile_to_module_lines
mod = PyCodeCache.load_by_key_path(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/codecache.py", line 3548, in load_by_key_path
mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/torchinductor_kang/o5/co57jylijj7cvz43rlumuuyb5ucetrngon22g7tofnvyrpgmlwcg.py", line 3103, in <module>
async_compile.wait(globals())
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/async_compile.py", line 631, in wait
self._wait_futures(scope)
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/async_compile.py", line 651, in _wait_futures
kernel = result.result()
^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/codecache.py", line 4289, in result
return self.result_fn()
^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/async_compile.py", line 470, in get_result
kernel.precompile(
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 451, in precompile
self._make_launchers()
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 614, in _make_launchers
launchers.append(result.make_launcher())
^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1713, in make_launcher
binary._init_handles()
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/triton/compiler/compiler.py", line 462, in _init_handles
self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/kang/wkplace/pyenv/torch/lib/python3.12/site-packages/triton/backends/intel/driver.py", line 213, in load_binary
return self.shared_library.load_binary(args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch._inductor.exc.InductorError: RuntimeError: ZE_RESULT_ERROR_MODULE_BUILD_FAILURE
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
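
If it helps triage, I expect a much smaller script to exercise the same BatchNorm-under-compile path (untested sketch, so treat it as hypothetical rather than a confirmed repro):

import torch
from torch import nn

# Tiny conv + batch-norm model; training-mode BN is what the failing
# triton_per_fused__native_batch_norm_legit_functional kernel implements.
net = nn.Sequential(nn.Conv2d(1, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
net = torch.compile(net.to("xpu"))
net.train()

x = torch.randn(16, 1, 28, 28, device="xpu")
with torch.amp.autocast("xpu", dtype=torch.bfloat16):
    y = net(x)  # compile happens on first call and should hit the same kernel
y.sum().backward()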
Environment details
python3 -m torch.utils.collect_env
<frozen runpy>:128: RuntimeWarning: 'torch.utils.collect_env' found in sys.modules after import of package 'torch.utils', but prior to execution of 'torch.utils.collect_env'; this may result in unpredictable behaviour
Collecting environment information...
PyTorch version: 2.9.0+xpu
Is debug build: False
CUDA used to build PyTorch: Could not collect
ROCM used to build PyTorch: N/A
OS: Ubuntu 24.04.3 LTS (x86_64)
GCC version: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
Clang version: 18.1.3 (1ubuntu1)
CMake version: version 3.28.3
Libc version: glibc-2.39
Python version: 3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0] (64-bit runtime)
Python platform: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39
Is CUDA available: False
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to: N/A
GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1060 6GB
Nvidia driver version: 576.80
cuDNN version: Could not collect
Is XPU available: True
XPU used to build PyTorch: 20250201
Intel GPU driver version:
- intel-opencl-icd: 25.35.35096.9-1~24.04ppa3
- libze1: 1.24.1-1~24.04ppa1
Intel GPU models onboard:
N/A
Intel GPU models detected:
- [0] _XpuDeviceProperties(name='Intel(R) Graphics [0x56a0]', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', device_id=0x56A0, uuid=8680a056-0800-0000-1100-000000000000, driver_version='1.6.35096+9', total_memory=15930MB, max_compute_units=512, gpu_eu_count=512, gpu_subslice_count=64, max_work_group_size=1024, max_num_sub_groups=128, sub_group_sizes=[8 16 32], has_fp16=1, has_fp64=0, has_atomic64=1)
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
CPU:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 48 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 22
On-line CPU(s) list: 0-21
Vendor ID: AuthenticAMD
Model name: AMD Ryzen 9 5900X 12-Core Processor
CPU family: 25
Model: 33
Thread(s) per core: 2
Core(s) per socket: 11
Socket(s): 1
Stepping: 0
BogoMIPS: 7386.21
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr arat npt nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload umip vaes vpclmulqdq rdpid fsrm
Virtualization: AMD-V
Hypervisor vendor: Microsoft
Virtualization type: full
L1d cache: 352 KiB (11 instances)
L1i cache: 352 KiB (11 instances)
L2 cache: 5.5 MiB (11 instances)
L3 cache: 32 MiB (1 instance)
NUMA node(s): 1
NUMA node0 CPU(s): 0-21
Vulnerability Gather data sampling: Not affected
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Not affected
Vulnerability Reg file data sampling: Not affected
Vulnerability Retbleed: Not affected
Vulnerability Spec rstack overflow: Vulnerable: Safe RET, no microcode
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; IBRS_FW; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: Not affected
Versions of relevant libraries:
[pip3] dpcpp-cpp-rt==2025.2.1
[pip3] impi-rt==2021.16.1
[pip3] intel-cmplr-lib-rt==2025.2.1
[pip3] intel-cmplr-lib-ur==2025.2.1
[pip3] intel-cmplr-lic-rt==2025.2.1
[pip3] intel-opencl-rt==2025.2.1
[pip3] intel-openmp==2025.2.1
[pip3] intel-pti==0.13.1
[pip3] intel-sycl-rt==2025.2.1
[pip3] mkl==2025.2.0
[pip3] numpy==2.3.3
[pip3] oneccl==2021.16.1
[pip3] oneccl-devel==2021.16.1
[pip3] onemkl-sycl-blas==2025.2.0
[pip3] onemkl-sycl-dft==2025.2.0
[pip3] onemkl-sycl-lapack==2025.2.0
[pip3] onemkl-sycl-rng==2025.2.0
[pip3] onemkl-sycl-sparse==2025.2.0
[pip3] pytorch-triton-xpu==3.5.0
[pip3] tbb==2022.2.0
[pip3] tcmlib==1.4.0
[pip3] torch==2.9.0+xpu
[pip3] torchaudio==2.9.0+xpu
[pip3] torchvision==0.24.0+xpu
[pip3] umf==0.11.0
[conda] No relevant packages