
Commit 082454e

Merge branch 'master' into chunyuan/lstm_dropout_fallback

Conflicts:
	intel_pytorch_extension_py/ops/lstm.py

2 parents: f4eeffa + e904bb3


55 files changed (+1854, -546 lines)

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@
 [submodule "third_party/xsmm"]
 	path = third_party/xsmm
 	url = https://github.com/hfp/libxsmm.git
+[submodule "third_party/torch_ccl"]
+	path = third_party/torch_ccl
+	url = https://github.com/intel/torch-ccl.git

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,12 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(PLUGIN_NAME _torch_ipex)
 
+set(RPATH_VALUE $ORIGIN)
+set(CMAKE_SKIP_BUILD_RPATH FALSE)
+set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}/lib/")
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
+
 set(DPCPP_ROOT "${PROJECT_SOURCE_DIR}/torch_ipex/csrc")
 set(DPCPP_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
 
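These RPATH settings stamp an $ORIGIN-relative search path into the built module, so the bundled lib/ directory next to it is found at load time regardless of the install prefix. As a quick sanity check on a Linux host with binutils, the embedded runpath can be inspected from Python (the module filename below is hypothetical; adjust it to your build tree):

```python
import subprocess

# Hypothetical location of the built extension; adjust for your build.
module_path = "_torch_ipex.cpython-38-x86_64-linux-gnu.so"

# readelf -d dumps the dynamic section; with the settings above it should
# contain a RUNPATH (or RPATH) entry of $ORIGIN/lib/
dynamic = subprocess.run(
    ["readelf", "-d", module_path],
    capture_output=True, text=True, check=True,
).stdout

for line in dynamic.splitlines():
    if "RPATH" in line or "RUNPATH" in line:
        print(line.strip())
```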

README.md

Lines changed: 114 additions & 6 deletions
@@ -3,31 +3,33 @@
 Intel Extension for PyTorch is a Python package to extend official PyTorch. It is designed to improve the out-of-box user experience of PyTorch on CPU while achieving good performance. The extension will also serve as the PR (pull request) buffer for the Intel PyTorch framework dev team; the buffer will contain not only functions but also optimizations (for example, ones taking advantage of Intel's new hardware features).
 
 - [Installation](#installation)
-- [Install PyTorch from Source](#install-pytorch-from-source)
-- [Install Intel Extension for PyTorch from Source](#install-intel-extension-for-pytorch-from-source)
+  - [Install PyTorch from Source](#install-pytorch-from-source)
+  - [Install Intel Extension for PyTorch from Source](#install-intel-extension-for-pytorch-from-source)
 - [Getting Started](#getting-started)
 - [Automatically Mix Precision](#automatically-mix-precision)
+  - [BFloat16](#BFloat16)
+  - [INT8](#int8-quantization)
 - [Contribution](#contribution)
 - [License](#license)
 
 ## Installation
 
 ### Install PyTorch from Source
 
-1. Get PyTorch v1.5.0-rc3 source (refer to the [PyTorch guide](https://github.com/pytorch/pytorch#get-the-pytorch-source) for more details)
+1. Get PyTorch v1.7.0 source (refer to the [PyTorch guide](https://github.com/pytorch/pytorch#get-the-pytorch-source) for more details)
 ```bash
 git clone --recursive https://github.com/pytorch/pytorch
 cd pytorch
 
 # checkout source code to the specified version
-git checkout v1.5.0-rc3
+git checkout v1.7.0
 
 # update submodules for the specified PyTorch version
 git submodule sync
 git submodule update --init --recursive
 ```
 
-2. Get Intel PyTorch Extension source
+2. Get the source code of Intel Extension for PyTorch
 ```bash
 git clone --recursive https://github.com/intel/intel-extension-for-pytorch
 cd intel-extension-for-pytorch
@@ -41,7 +43,7 @@ Intel Extension for PyTorch is a Python package to extend official PyTorch. It i
 ```bash
 # Apply git patch to pytorch code
 cd ${pytorch_directory}
-git apply ${intel_extension_for_pytorch_directory}/torch_patches/dpcpp-v1.5-rc3.patch
+git apply ${intel_extension_for_pytorch_directory}/torch_patches/xpu-1.7.patch
 ```
 
 4. Build and install PyTorch (refer to the [PyTorch guide](https://github.com/pytorch/pytorch#install-pytorch) for more details)
@@ -109,6 +111,8 @@ res = model(input)
 In addition, Intel Extension for PyTorch supports mixed precision: some operators of a model may run with Float32 while other operators run with BFloat16 or INT8.
 Traditionally, to run a model with a low-precision type you must manually convert the parameters and input tensors to that type, and if the model contains operators that do not support it, convert back to Float32, round after round, until the model runs normally.
 The extension simplifies this: just enable auto-mix-precision as follows, and you benefit from the low precision. Currently, the extension only supports BFloat16.
+
+#### BFloat16
 ```python
 import torch
 import torch.nn as nn
@@ -130,6 +134,110 @@ model = Model().to(ipex.DEVICE)
 
 res = model(input)
 ```
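The hunk above elides the middle of the BFloat16 example, so the enabling step itself is not shown in this diff and is not reproduced here. A hedged sketch of what it plausibly looks like, reusing the ```ipex.AmpConf```/```ipex.AutoMixPrecision``` API from the INT8 section below (whether ```running_mode``` may be omitted for BFloat16 is an assumption), with ```AmpConf```'s ```mixed_dtype``` defaulting to ```torch.bfloat16``` per the ```__init__.py``` diff further down:

```python
import torch
import intel_pytorch_extension as ipex  # assumed import name of the installed package

# model = Model().to(ipex.DEVICE) and an input tensor are defined as in the diff above
conf = ipex.AmpConf(torch.bfloat16)   # mixed_dtype defaults to torch.bfloat16
with torch.no_grad():
    with ipex.AutoMixPrecision(conf):  # assumption: running_mode optional for BFloat16
        res = model(input.to(ipex.DEVICE))
```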
+#### INT8 Quantization
+Currently, Intel Extension for PyTorch supports static, symmetric quantization. Dynamic quantization is under development, and asymmetric quantization will be enabled once oneDNN is upgraded to v2.0 or higher.
+
+How to quantize the following model:
+```python
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+        self.conv = nn.Conv2d(3, 64, 7, stride=2)
+
+    def forward(self, input):
+        return self.conv(input).relu()
+```
+First, run a calibration step against a representative dataset (set ```running_mode``` to ```calibration```):
+```python
+# Convert the model to the Extension device
+model = Model().to(ipex.DEVICE)
+
+# Create a configuration file to save quantization parameters.
+conf = ipex.AmpConf(torch.int8)
+with torch.no_grad():
+    for x in cali_dataset:
+        # Run the model under calibration mode to collect quantization parameters
+        with ipex.AutoMixPrecision(conf, running_mode='calibration'):
+            y = model(x.to(ipex.DEVICE))
+# Save the configuration file
+conf.save('configure.json')
+```
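The loop above iterates over a ```cali_dataset``` that the diff never defines; any iterable of representative inputs works. A minimal, hypothetical stand-in matching the (N, 3, H, W) shape that ```Model```'s ```Conv2d(3, 64, 7)``` expects:

```python
import torch

# Toy calibration set: a few random batches. Real calibration should use
# representative data, since the observed min/max values determine the scales.
cali_dataset = [torch.randn(1, 3, 224, 224) for _ in range(10)]
```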
+The content of the configuration file is as follows.
+
+```json
+[
+    {
+        "id": 0,
+        "name": "Convolution",
+        "algorithm": "min_max",
+        "weight_granularity": "per_channel",
+        "inputs_scale": [
+            25.05583953857422
+        ],
+        "outputs_scale": [
+            43.98969650268555
+        ],
+        "inputs_uint8_used": [
+            false
+        ],
+        "outputs_uint8_used": [
+            false
+        ],
+        "quantized": true
+    },
+    {
+        "id": 1,
+        "name": "Relu",
+        "algorithm": "min_max",
+        "weight_granularity": "per_channel",
+        "inputs_scale": [
+            43.98969650268555
+        ],
+        "outputs_scale": [
+            43.98969650268555
+        ],
+        "inputs_uint8_used": [
+            false
+        ],
+        "outputs_uint8_used": [
+            false
+        ],
+        "quantized": true
+    }
+]
+```
+- ```id``` is a sequence number of the operators that were statically quantized in the calibration step.
+  **Manually changing this value will cause unexpected behavior.**
+- ```name``` is the name of the operator to be quantized.
+- ```algorithm``` indicates how to calculate the scales of the observed tensors. Currently only ```min_max``` is supported.
+- ```weight_granularity``` controls how to quantize the operator weights. ```Convolution``` and ```Linear``` both support ```per_channel``` and ```per_tensor```; the other operators support only ```per_tensor```.
+- ```inputs_scale``` and ```outputs_scale``` are the scales used to quantize the input and output tensors, respectively.
+- ```inputs_uint8_used``` and ```outputs_uint8_used``` indicate whether ```uint8``` is used instead of ```int8```. The default is ```false```, meaning ```int8``` is used.
+- ```quantized``` determines whether this operator should be quantized during inference (see the sketch after this list).
+
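Because the configuration file is plain JSON with the fields listed above, per-operator behavior can be adjusted offline. A sketch of a hypothetical workflow, not part of this commit, that turns quantization off for the ```Relu``` entry while leaving ```id``` untouched:

```python
import json

with open('configure.json') as f:
    conf_entries = json.load(f)

# Flip the documented 'quantized' flag for Relu only; never edit 'id',
# since manually changing it causes unexpected behavior.
for entry in conf_entries:
    if entry["name"] == "Relu":
        entry["quantized"] = False

with open('configure.json', 'w') as f:
    json.dump(conf_entries, f, indent=4)
```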
+After the calibration step, we can use the saved configuration JSON file to run inference (set ```running_mode``` to ```inference```):
+```python
+conf = ipex.AmpConf(torch.int8, 'configure.json')
+with torch.no_grad():
+    for x in cali_dataset:
+        with ipex.AutoMixPrecision(conf, running_mode='inference'):
+            y = model(x.to(ipex.DEVICE))
+```
+
+Supported Quantization Operators:
+- ```Convolution```
+- ```BatchNorm```
+- ```MaxPooling```
+- ```AvgPooling```
+- ```AdaptivePooling```
+- ```Linear```
+- ```convolution + relu```
+- ```convolution + sum```
+- ```convolution + sum + relu```
+- ```convolution + BatchNorm```
 
 
 ## Contribution

cmake/CPU.cmake

Lines changed: 11 additions & 7 deletions
@@ -12,7 +12,7 @@ SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
 
 set(DPCPP_CPU_ROOT "${PROJECT_SOURCE_DIR}/torch_ipex/csrc/cpu")
 add_subdirectory(${DPCPP_THIRD_PARTY_ROOT}/mkl-dnn)
-
+find_package(TorchCCL REQUIRED)
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
 FIND_PACKAGE(AVX)
@@ -125,9 +125,15 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-trapping-math")
 
 # includes
 
+# include mkl-dnn before PyTorch
+# Otherwise, path_to_pytorch/torch/include/dnnl.hpp will be used as the header
+include_directories(${PROJECT_SOURCE_DIR}/build/third_party/mkl-dnn/include)
+include_directories(${DPCPP_THIRD_PARTY_ROOT}/mkl-dnn/include)
+
 # Set installed PyTorch dir
 if(DEFINED PYTORCH_INSTALL_DIR)
   include_directories(${PYTORCH_INSTALL_DIR}/include)
+  include_directories(${PYTORCH_INSTALL_DIR}/include/torch/csrc/api/include/)
 else()
   message(FATAL_ERROR, "Cannot find installed PyTorch directory")
 endif()
@@ -136,9 +142,8 @@ include_directories(${PROJECT_SOURCE_DIR})
 include_directories(${PROJECT_SOURCE_DIR}/torch_ipex)
 include_directories(${PROJECT_SOURCE_DIR}/torch_ipex/csrc/)
 include_directories(${DPCPP_THIRD_PARTY_ROOT}/pybind11/include)
-include_directories(${PROJECT_SOURCE_DIR}/build/third_party/mkl-dnn/include)
-include_directories(${DPCPP_THIRD_PARTY_ROOT}/mkl-dnn/include)
 include_directories(${DPCPP_THIRD_PARTY_ROOT}/xsmm/include)
+include_directories(${TORCHCCL_INCLUDE_DIR})
 
 # sources
 set(DPCPP_SRCS)
@@ -167,7 +172,7 @@ set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS} ${DPCPP
 pybind11_add_module(${PLUGIN_NAME} SHARED ${DPCPP_SRCS})
 target_link_libraries(${PLUGIN_NAME} PRIVATE ${DPCPP_THIRD_PARTY_ROOT}/xsmm/lib/libxsmm.a)
 
-link_directories(${PYTORCH_INSTALL_DIR}/lib)
+#link_directories(${PYTORCH_INSTALL_DIR}/lib)
 target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libtorch_cpu.so)
 target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libc10.so)
 
@@ -184,12 +189,11 @@ else()
 endif()
 
 add_dependencies(${PLUGIN_NAME} pybind11)
-
+add_dependencies(${PLUGIN_NAME} torch_ccl)
 add_dependencies(${PLUGIN_NAME} dnnl)
 target_link_libraries(${PLUGIN_NAME} PUBLIC dnnl)
-
 add_dependencies(${PLUGIN_NAME} xsmm)
-
+target_link_libraries(${PLUGIN_NAME} PUBLIC torch_ccl)
 link_directories(${PYTORCH_INSTALL_DIR}/lib)
 target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so)
 target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libtorch_cpu.so)

cmake/Modules/FindTorchCCL.cmake

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# - Try to find torch-ccl
+#
+# The following are set after configuration is done:
+#  TORCHCCL_FOUND        : set to true if oneCCL is found.
+#  TORCHCCL_INCLUDE_DIR  : path to oneCCL include dir.
+#  TORCHCCL_LIBRARIES    : list of libraries for oneCCL
+#
+# The following variables are used:
+#  TORCHCCL_USE_NATIVE_ARCH : Whether native CPU instructions should be used in TORCHCCL. This should be turned off for
+#  general packaging to avoid incompatible CPU instructions. Default: OFF.
+
+IF (NOT TORCHCCL_FOUND)
+  SET(TORCHCCL_FOUND OFF)
+
+  SET(TORCHCCL_LIBRARIES)
+  SET(TORCHCCL_INCLUDE_DIR)
+
+  SET(TORCHCCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/torch_ccl")
+
+  ADD_SUBDIRECTORY(${TORCHCCL_ROOT})
+  IF(NOT TARGET torch_ccl)
+    MESSAGE(FATAL_ERROR "Failed to include torch_ccl target")
+  ENDIF()
+  GET_TARGET_PROPERTY(INCLUDE_DIRS torch_ccl INCLUDE_DIRECTORIES)
+  SET(TORCHCCL_INCLUDE_DIR ${INCLUDE_DIRS})
+  SET(TORCHCCL_LIBRARIES torch_ccl)
+
+ENDIF(NOT TORCHCCL_FOUND)

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -50,9 +50,9 @@ RUN --mount=type=cache,target=/opt/ccache \
     cd intel-extension-for-pytorch && git submodule sync && \
     git submodule update --init --recursive && \
     git clone https://github.com/pytorch/pytorch && \
-    cd pytorch && git checkout v1.5.1 && git submodule sync && \
+    cd pytorch && git checkout v1.7.0 && git submodule sync && \
     git submodule update --init --recursive && \
-    git apply ../torch_patches/dpcpp-v1.5.1.patch && \
+    git apply ../torch_patches/xpu-1.7.patch && \
     USE_MKLDNN=1 USE_CUDA=0 USE_NNPACK=0 USE_CUDNN=0 \
     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" pip install -v . && \
     cd .. && pip install -v . && rm -rf *

intel_pytorch_extension_py/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -7,8 +7,9 @@
 from .optim import *
 from .ops import *
 import _torch_ipex as core
+core.enable_torch_ccl()
 
-DEVICE = 'dpcpp'
+DEVICE = 'xpu:0'
 
 class AmpConf(object):
     def __init__(self, mixed_dtype = torch.bfloat16, configure_file = None):
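Changing ```DEVICE``` from ```'dpcpp'``` to ```'xpu:0'``` is what the RNN-family fallbacks below key on: they now compare ```input.device.type``` against ```'xpu'``` instead of comparing full device objects. A quick check, assuming the extension is installed under the import name ```intel_pytorch_extension```:

```python
import torch
import intel_pytorch_extension as ipex  # assumed import name

x = torch.randn(2, 3).to(ipex.DEVICE)  # DEVICE is now 'xpu:0'
print(x.device.type)                    # prints 'xpu', so input.device.type == 'xpu' holds
```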

intel_pytorch_extension_py/ops/gru.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 VF_gru = _VF.gru
 
 def ipex_gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first):
-    if input.device == torch.device('dpcpp') and (dropout == 0 or training == False):
+    if input.device.type == 'xpu' and (dropout == 0 or training == False):
         return torch.ops.torch_ipex.gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
     else:
         return VF_gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)

intel_pytorch_extension_py/ops/lstm.py

Lines changed: 2 additions & 2 deletions
@@ -34,9 +34,9 @@ def fallback_lstm(*args, device):
         else:
             item_cpu = item
         args_cpu.append(item_cpu)
-
+
     output = VF_lstm(*args_cpu)
-
+
     # move output to the original device
     output_device = []
     # output is a tuple which does not support item assignment
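```fallback_lstm``` copies its arguments to the CPU, calls the native ```_VF.lstm```, and moves tensor outputs back to the original device; this hunk only normalizes whitespace around that logic. The same pattern generalizes to any op lacking a device kernel; a minimal sketch (not the extension's code, and it ignores nested containers such as weight lists):

```python
import torch

def fallback_to_cpu(fn, *args, device):
    """Run fn on CPU copies of the tensor args, then move results back."""
    args_cpu = [a.to('cpu') if isinstance(a, torch.Tensor) else a for a in args]
    output = fn(*args_cpu)
    # output is a tuple, which does not support item assignment, so rebuild it
    return tuple(o.to(device) if isinstance(o, torch.Tensor) else o
                 for o in output)
```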

intel_pytorch_extension_py/ops/rnn.py

Lines changed: 2 additions & 2 deletions
@@ -10,13 +10,13 @@
 from torch import _VF
 
 def rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first):
-    if input.device == torch.device('dpcpp') and (dropout == 0 or training == False):
+    if input.device.type == 'xpu' and (dropout == 0 or training == False):
         return torch.ops.torch_ipex.rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
     else:
         return _VF.rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
 
 def rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first):
-    if input.device == torch.device('dpcpp') and (dropout == 0 or training == False):
+    if input.device.type == 'xpu' and (dropout == 0 or training == False):
         return torch.ops.torch_ipex.rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
     else:
         return _VF.rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
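With these checks, RNN ops on the extension device are routed to the ```torch_ipex``` kernels only while dropout is inactive; otherwise they fall back to the stock ```_VF``` implementations, which is exactly the dropout fallback this branch is about. A usage sketch, assuming the extension is installed and registers the ```'xpu'``` device:

```python
import torch
import torch.nn as nn
import intel_pytorch_extension as ipex  # assumed import name

rnn = nn.RNN(10, 20, num_layers=2, dropout=0.5).to(ipex.DEVICE)
x = torch.randn(5, 3, 10).to(ipex.DEVICE)

rnn.train()  # dropout > 0 while training: rnn_tanh dispatches to _VF.rnn_tanh
rnn.eval()   # dropout inactive: torch.ops.torch_ipex.rnn_tanh handles the call
with torch.no_grad():
    y, h = rnn(x)
```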
