
Commit 5388fe4

Merge branch 'main' into RC-TEST-2.8
2 parents 658033d + c518c51 commit 5388fe4

40 files changed (+472 / -211 lines)

.ci/docker/requirements.txt

Lines changed: 0 additions & 5 deletions
@@ -15,7 +15,6 @@ pypandoc==1.15
 pandocfilters==1.5.1
 markdown==3.8.2
 
-
 # PyTorch Theme
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
 
@@ -51,12 +50,8 @@ onnxruntime
 evaluate
 accelerate>=0.20.1
 
-
 importlib-metadata==6.8.0
 
-# PyTorch Theme
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-
 ipython
 
 sphinxcontrib.katex

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ advanced
 pytorch_basics
 /recipes
 prototype
+/unstable
+sg_execution_times.rst
 
 #data things
 _data/

.jenkins/download_data.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 BEGINNER_DATA_DIR = REPO_BASE_DIR / "beginner_source" / "data"
 INTERMEDIATE_DATA_DIR = REPO_BASE_DIR / "intermediate_source" / "data"
 ADVANCED_DATA_DIR = REPO_BASE_DIR / "advanced_source" / "data"
-PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "prototype_source" / "data"
+PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "unstable_source" / "data"
 FILES_TO_RUN = os.getenv("FILES_TO_RUN")
 
 
@@ -106,7 +106,7 @@ def download_lenet_mnist() -> None:
     )
 
 def download_gpu_quantization_torchao() -> None:
-    # Download SAM model checkpoint for prototype_source/gpu_quantization_torchao_tutorial.py
+    # Download SAM model checkpoint unstable_source/gpu_quantization_torchao_tutorial.py
     download_url_to_file("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
                          prefix=PROTOTYPE_DATA_DIR,
                          dst="sam_vit_h_4b8939.pth",

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 13 deletions
@@ -18,36 +18,24 @@
     "beginner_source/examples_nn/polynomial_module",
     "beginner_source/examples_nn/dynamic_net",
     "beginner_source/examples_nn/polynomial_optim",
-    "beginner_source/former_torchies/autograd_tutorial_old",
-    "beginner_source/former_torchies/tensor_tutorial_old",
     "beginner_source/examples_autograd/polynomial_autograd",
     "beginner_source/examples_autograd/polynomial_custom_function",
     "intermediate_source/mnist_train_nas",  # used by ax_multiobjective_nas_tutorial.py
     "intermediate_source/torch_compile_conv_bn_fuser",
     "intermediate_source/_torch_export_nightly_tutorial",  # does not work on release
     "advanced_source/usb_semisup_learn",  # fails with CUDA OOM error, should try on a different worker
-    "prototype_source/fx_graph_mode_ptq_dynamic",
-    "prototype_source/vmap_recipe",
-    "prototype_source/torchscript_freezing",
-    "prototype_source/nestedtensor",
-    "prototype_source/gpu_direct_storage",  # requires specific filesystem + GPUDirect Storage to be set up
-    "recipes_source/recipes/saving_and_loading_models_for_inference",
-    "recipes_source/recipes/saving_multiple_models_in_one_file",
+    "unstable_source/gpu_direct_storage",  # requires specific filesystem + GPUDirect Storage to be set up
     "recipes_source/recipes/tensorboard_with_pytorch",
     "recipes_source/recipes/what_is_state_dict",
     "recipes_source/recipes/profiler_recipe",
-    "recipes_source/recipes/save_load_across_devices",
     "recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model",
-    "recipes_source/recipes/dynamic_quantization",
-    "recipes_source/recipes/saving_and_loading_a_general_checkpoint",
     "recipes_source/recipes/benchmark",
     "recipes_source/recipes/tuning_guide",
     "recipes_source/recipes/zeroing_out_gradients",
     "recipes_source/recipes/defining_a_neural_network",
     "recipes_source/recipes/timer_quick_start",
     "recipes_source/recipes/amp_recipe",
     "recipes_source/recipes/Captum_Recipe",
-    "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial",  # reenable after 2.0 release.
     "advanced_source/semi_structured_sparse",  # reenable after 3303 is fixed.
     "intermediate_source/torchrec_intro_tutorial.py",  # failing with 2.8 reenable after 3498

.lintrunner.toml

Lines changed: 10 additions & 10 deletions
@@ -66,15 +66,15 @@ exclude_patterns = [
     "intermediate_source/tiatoolbox_tutorial.rst",
     "intermediate_source/torch_compile_tutorial.py",
     "intermediate_source/transformer_building_blocks.py",
-    "prototype_source/README.md",
-    "prototype_source/README.txt",
-    "prototype_source/backend_config_tutorial.rst",
-    "prototype_source/gpu_direct_storage.py",
-    "prototype_source/inductor_cpp_wrapper_tutorial.rst",
-    "prototype_source/inductor_windows.rst",
-    "prototype_source/maskedtensor_advanced_semantics.py",
-    "prototype_source/max_autotune_on_CPU_tutorial.rst",
-    "prototype_source/vmap_recipe.py",
+    "unstable_source/README.md",
+    "unstable_source/README.txt",
+    "unstable_source/backend_config_tutorial.rst",
+    "unstable_source/gpu_direct_storage.py",
+    "unstable_source/inductor_cpp_wrapper_tutorial.rst",
+    "unstable_source/inductor_windows.rst",
+    "unstable_source/maskedtensor_advanced_semantics.py",
+    "unstable_source/max_autotune_on_CPU_tutorial.rst",
+    "unstable_source/vmap_recipe.py",
     "recipes_source/README.txt",
     "recipes_source/amx.rst",
     "recipes_source/compiling_optimizer.rst",

@@ -150,7 +150,7 @@ exclude_patterns = [
     "intermediate_source/README.txt",
     "intermediate_source/TP_tutorial.rst",
     "intermediate_source/inductor_debug_cpu.py",
-    "prototype_source/README.txt",
+    "unstable_source/README.txt",
     "recipes_source/README.txt",
     "recipes_source/recipes/README.txt",
     "recipes_source/xeon_run_cpu.rst",
Lines changed: 274 additions & 0 deletions
@@ -0,0 +1,274 @@

.. _cpp-custom-ops-tutorial-sycl:

Custom SYCL Operators
=====================

.. grid:: 2

    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
        :class-card: card-prerequisites

        * How to integrate custom operators written in SYCL with PyTorch

    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
        :class-card: card-prerequisites

        * PyTorch 2.8 or later
        * Basic understanding of SYCL programming

.. note::

    ``SYCL`` serves as the backend programming language for Intel GPUs (device label ``xpu``). For configuration details, see
    `Getting Started on Intel GPUs <https://docs.pytorch.org/docs/main/notes/get_start_xpu.html>`_. The Intel Compiler, which comes bundled with Intel Deep Learning Essentials, handles ``SYCL`` compilation. Ensure you install and activate the compiler environment prior to executing the code examples in this tutorial.
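
A quick way to confirm the environment before building anything is to check that PyTorch can see the ``xpu`` device. This is a minimal verification sketch, not part of the extension itself:

.. code-block:: python

    import torch

    # True only if PyTorch was built with XPU support and an Intel GPU is visible
    print(torch.xpu.is_available())
    # If available, prints the name of the first Intel GPU
    print(torch.xpu.get_device_name(0))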

PyTorch offers a large library of operators that work on Tensors (for example, ``torch.add`` and ``torch.sum``).
However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the
best path to authoring a custom operator written in SYCL. Tutorials for C++ and CUDA operators are available in :ref:`cpp-custom-ops-tutorial`.

Use the following structure to create a custom SYCL operator:

.. code-block:: text

    sycl_example/
    ├── setup.py
    ├── sycl_extension
    │   ├── __init__.py
    │   ├── muladd.sycl
    │   └── ops.py
    └── test_sycl_extension.py

Setting up the Build System
---------------------------

If you need to compile **SYCL** code (for example, ``.sycl`` files), use `torch.utils.cpp_extension.SyclExtension <https://docs.pytorch.org/docs/stable/cpp_extension.html#torch.utils.cpp_extension.SyclExtension>`_.
The setup process is very similar to C++/CUDA, except that the compilation arguments need to be adjusted for SYCL.

Using ``sycl_extension`` is as straightforward as writing the following ``setup.py``:

.. code-block:: python

    import os
    import glob

    import torch
    from setuptools import find_packages, setup
    from torch.utils.cpp_extension import SyclExtension, BuildExtension

    library_name = "sycl_extension"
    py_limited_api = True
    extra_compile_args = {
        "cxx": ["-O3",
                "-fdiagnostics-color=always",
                "-DPy_LIMITED_API=0x03090000"],
        "sycl": ["-O3"],
    }

    assert torch.xpu.is_available(), "XPU is not available, please check your environment"
    # Collect the SYCL source files
    this_dir = os.path.dirname(os.path.curdir)
    extensions_dir = os.path.join(this_dir, library_name)
    sources = list(glob.glob(os.path.join(extensions_dir, "*.sycl")))
    # Construct the extension
    ext_modules = [
        SyclExtension(
            f"{library_name}._C",
            sources,
            extra_compile_args=extra_compile_args,
            py_limited_api=py_limited_api,
        )
    ]
    setup(
        name=library_name,
        packages=find_packages(),
        ext_modules=ext_modules,
        install_requires=["torch"],
        description="Simple Example of PyTorch Sycl extensions",
        cmdclass={"build_ext": BuildExtension},
        options={"bdist_wheel": {"py_limited_api": "cp39"}} if py_limited_api else {},
    )
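
With this ``setup.py`` in place, the extension builds like any other setuptools project. For example, running ``pip install --no-build-isolation -e .`` from the ``sycl_example/`` directory, with the Intel compiler environment activated, is one common way to compile and install it in development mode.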

Defining the custom op and adding backend implementations
---------------------------------------------------------

First, let's write a SYCL function that computes ``mymuladd``.

In order to use this from PyTorch's Python frontend, we need to register it
as a PyTorch operator using the ``TORCH_LIBRARY`` API. This will automatically
bind the operator to Python.

The SYCL implementation of ``mymuladd`` is then registered for the ``XPU`` backend
in a separate ``TORCH_LIBRARY_IMPL`` block:

.. code-block:: cpp

    #include <c10/xpu/XPUStream.h>
    #include <sycl/sycl.hpp>
    #include <ATen/Operators.h>
    #include <torch/all.h>
    #include <torch/library.h>

    namespace sycl_extension {
    // MulAdd Kernel: result = a * b + c
    static void muladd_kernel(
        int numel, const float* a, const float* b, float c, float* result,
        const sycl::nd_item<1>& item) {
      int idx = item.get_global_id(0);
      if (idx < numel) {
        result[idx] = a[idx] * b[idx] + c;
      }
    }

    class MulAddKernelFunctor {
     public:
      MulAddKernelFunctor(int _numel, const float* _a, const float* _b, float _c, float* _result)
          : numel(_numel), a(_a), b(_b), c(_c), result(_result) {}
      void operator()(const sycl::nd_item<1>& item) const {
        muladd_kernel(numel, a, b, c, result, item);
      }

     private:
      int numel;
      const float* a;
      const float* b;
      float c;
      float* result;
    };

    at::Tensor mymuladd_xpu(const at::Tensor& a, const at::Tensor& b, double c) {
      TORCH_CHECK(a.sizes() == b.sizes(), "a and b must have the same shape");
      TORCH_CHECK(a.dtype() == at::kFloat, "a must be a float tensor");
      TORCH_CHECK(b.dtype() == at::kFloat, "b must be a float tensor");
      TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
      TORCH_CHECK(b.device().is_xpu(), "b must be an XPU tensor");

      at::Tensor a_contig = a.contiguous();
      at::Tensor b_contig = b.contiguous();
      at::Tensor result = at::empty_like(a_contig);

      const float* a_ptr = a_contig.data_ptr<float>();
      const float* b_ptr = b_contig.data_ptr<float>();
      float* res_ptr = result.data_ptr<float>();
      int numel = a_contig.numel();

      sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
      constexpr int threads = 256;
      // Round up so that every element is covered by one work-item
      int blocks = (numel + threads - 1) / threads;

      queue.submit([&](sycl::handler& cgh) {
        cgh.parallel_for<MulAddKernelFunctor>(
            sycl::nd_range<1>(blocks * threads, threads),
            MulAddKernelFunctor(numel, a_ptr, b_ptr, static_cast<float>(c), res_ptr)
        );
      });

      return result;
    }

    // Defines the operators
    TORCH_LIBRARY(sycl_extension, m) {
      m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
    }

    // ==================================================
    // Register SYCL Implementations to Torch Library
    // ==================================================
    TORCH_LIBRARY_IMPL(sycl_extension, XPU, m) {
      m.impl("mymuladd", &mymuladd_xpu);
    }

    } // namespace sycl_extension
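
Once the compiled library is loaded into the process (see ``__init__.py`` below), the schema defined via ``TORCH_LIBRARY`` becomes visible under ``torch.ops``. As a rough sketch of what to expect, assuming the extension built above is importable:

.. code-block:: python

    import torch
    import sycl_extension  # importing the package loads the compiled library

    # Operators registered in C++ surface as torch.ops.<namespace>.<name>
    op = torch.ops.sycl_extension.mymuladd.default
    print(op._schema)  # sycl_extension::mymuladd(Tensor a, Tensor b, float c) -> Tensor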

Create a Python Interface
-------------------------

Create a Python interface for our operator in the ``sycl_extension/ops.py`` file:

.. code-block:: python

    import torch
    from torch import Tensor

    __all__ = ["mymuladd"]

    def mymuladd(a: Tensor, b: Tensor, c: float) -> Tensor:
        """Performs a * b + c in an efficient fused kernel"""
        return torch.ops.sycl_extension.mymuladd.default(a, b, c)

Initialize Package
------------------

Create a ``sycl_extension/__init__.py`` file to make the package importable:

.. code-block:: python

    import ctypes
    from pathlib import Path

    import torch

    current_dir = Path(__file__).parent.parent
    build_dir = current_dir / "build"
    so_files = list(build_dir.glob("**/*.so"))

    assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"

    with torch._ops.dl_open_guard():
        loaded_lib = ctypes.CDLL(so_files[0])

    from . import ops

    __all__ = [
        "loaded_lib",
        "ops",
    ]
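
With the package in place, calling the operator end to end is straightforward. A small smoke test, assuming the extension has been built and an Intel GPU is available:

.. code-block:: python

    import torch
    import sycl_extension

    a = torch.randn(8, device="xpu")
    b = torch.randn(8, device="xpu")
    out = sycl_extension.ops.mymuladd(a, b, 0.5)
    # The fused kernel should agree with the unfused reference computation
    print(torch.allclose(out, a * b + 0.5))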

Testing SYCL extension operator
-------------------------------

Use a simple test to verify that the operator works correctly:

.. code-block:: python

    import unittest

    import torch
    from torch.testing._internal.common_utils import TestCase

    import sycl_extension

    def reference_muladd(a, b, c):
        return a * b + c

    class TestMyMulAdd(TestCase):
        def sample_inputs(self, device, *, requires_grad=False):
            def make_tensor(*size):
                return torch.randn(size, device=device, requires_grad=requires_grad)

            def make_nondiff_tensor(*size):
                return torch.randn(size, device=device, requires_grad=False)

            return [
                [make_tensor(3), make_tensor(3), 1],
                [make_tensor(20), make_tensor(20), 3.14],
                [make_tensor(20), make_nondiff_tensor(20), -123],
                [make_nondiff_tensor(2, 3), make_tensor(2, 3), -0.3],
            ]

        def _test_correctness(self, device):
            samples = self.sample_inputs(device)
            for args in samples:
                result = sycl_extension.ops.mymuladd(*args)
                expected = reference_muladd(*args)
                torch.testing.assert_close(result, expected)

        @unittest.skipIf(not torch.xpu.is_available(), "requires Intel GPU")
        def test_correctness_xpu(self):
            self._test_correctness("xpu")

    if __name__ == "__main__":
        unittest.main()

This test checks the correctness of the custom operator by comparing its output against a reference implementation.
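
In addition to this eager-mode check, ``torch.library.opcheck`` can exercise the operator registration itself. Since this extension registers no FakeTensor/meta kernel, the sketch below limits ``opcheck`` to schema validation; the broader test suite is described in :ref:`cpp-custom-ops-tutorial`:

.. code-block:: python

    import torch
    import sycl_extension

    a = torch.randn(3, device="xpu")
    b = torch.randn(3, device="xpu")
    # Only validate the operator schema; the remaining opcheck utilities
    # require additional registrations (for example, a fake/meta kernel).
    torch.library.opcheck(
        torch.ops.sycl_extension.mymuladd.default,
        (a, b, 1.0),
        test_utils="test_schema",
    )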

Conclusion
----------

In this tutorial, we demonstrated how to implement and compile custom SYCL operators for PyTorch. We specifically showcased an inference operation ``muladd``. For adding backward support or enabling ``torch.compile`` compatibility, please refer to :ref:`cpp-custom-ops-tutorial`.
