diff --git a/third-party-programs/ktransformers/custom_gguf/README.md b/third-party-programs/ktransformers/custom_gguf/README.md
new file mode 100644
index 000000000..b38dfc603
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/README.md
@@ -0,0 +1,135 @@
+# Migrate ktransformers to SYCL version
+[SYCLomatic](https://github.com/oneapi-src/SYCLomatic) is a project to assist developers in migrating their existing code written in different programming languages to the SYCL* C++ heterogeneous programming model. It is an open source version of the Intel® DPC++ Compatibility Tool.
+
+This file lists the detailed steps to migrate the CUDA version of [ktransformers](https://github.com/kvcache-ai/ktransformers.git) to a SYCL version with SYCLomatic. The following table summarizes the migration environment and the software required.
+
+ | Optimized for | Description
+ |:--- |:---
+ | OS | Linux* Ubuntu* 22.04
+ | Software | Intel® oneAPI Base Toolkit, SYCLomatic
+ | What you will learn | Migration of CUDA code, running SYCL code on oneAPI and Intel devices
+ | Time to complete | TBD
+
+
+## Migrating ktransformers to SYCL
+
+### 1 Prepare the migration
+#### 1.1 Get the source code of ktransformers and install the dependencies
+```sh
+ $ git clone https://github.com/kvcache-ai/ktransformers.git
+ $ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+ $ export PATH=/usr/local/cuda/bin:$PATH
+ $ export PATH=/usr/local/cuda-12.4/bin:$PATH
+```
+
+#### 1.2 Prepare migration tool and environment
+
+ * Install the SYCL run environment [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). After installation, the Intel® DPC++ Compatibility Tool is also available; set up the SYCL run environment as follows:
+
+```
+ $ source /opt/intel/oneapi/setvars.sh
+ $ dpct --version # Intel® DPC++ Compatibility Tool version
+```
+ * If you want to try the latest version of the compatibility tool, install SYCLomatic by downloading a prebuilt [SYCLomatic release](https://github.com/oneapi-src/SYCLomatic/blob/SYCLomatic/README.md#Releases) or by [building from source](https://github.com/oneapi-src/SYCLomatic/blob/SYCLomatic/README.md). The following steps install a prebuilt version:
+ ```
+ $ export SYCLomatic_HOME=/path/to/install/SYCLomatic
+ $ mkdir $SYCLomatic_HOME
+ $ cd $SYCLomatic_HOME
+ $ wget https://github.com/oneapi-src/SYCLomatic/releases/download/20240203/linux_release.tgz # Change the timestamp 20240203 to the latest one
+ $ tar xzvf linux_release.tgz
+ $ source setvars.sh
+ $ dpct --version # SYCLomatic version
+ ```
+
+For more information on configuring environment variables, see [Use the setvars Script with Linux*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html).
+
+### 2 Migrate the source code
+Here, we use [custom_gguf](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/ktransformers_ext/cuda/custom_gguf) as an example to explain the migration process.
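+
+Before running the tool, it helps to see the kind of rewrite it performs. The sketch below uses a hypothetical scaling kernel, `scale_fp32` (not the actual `dequant.cu` code), to show how CUDA thread indexing and the `<<<grid, block>>>` launch map to `sycl::nd_item` and `parallel_for`; these are the same substitutions recorded in the generated `MainSourceFiles.yaml` shown later in this change.
+
+```cpp
+// CUDA original (for comparison):
+//   __global__ void scale_fp32(const float* in, float* out, int n, float s) {
+//       int i = blockIdx.x * blockDim.x + threadIdx.x;
+//       for (; i < n; i += blockDim.x * gridDim.x) out[i] = s * in[i];
+//   }
+// Migrated SYCL equivalent (hypothetical example):
+#include <sycl/sycl.hpp>
+#include <iostream>
+
+void scale_fp32(const float* in, float* out, int n, float s,
+                const sycl::nd_item<3>& item_ct1) {
+    // blockIdx.x * blockDim.x + threadIdx.x becomes:
+    int i = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+            item_ct1.get_local_id(2);
+    // blockDim.x * gridDim.x becomes:
+    for (; i < n; i += item_ct1.get_local_range(2) * item_ct1.get_group_range(2))
+        out[i] = s * in[i];
+}
+
+int main() {
+    constexpr int n = 1024;
+    sycl::queue q;
+    float* in  = sycl::malloc_shared<float>(n, q);
+    float* out = sycl::malloc_shared<float>(n, q);
+    for (int i = 0; i < n; ++i) in[i] = float(i);
+
+    // The <<<4, 256>>> launch becomes a parallel_for over an nd_range.
+    q.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) * sycl::range<3>(1, 1, 256),
+                                     sycl::range<3>(1, 1, 256)),
+                   [=](sycl::nd_item<3> item_ct1) {
+                       scale_fp32(in, out, n, 2.0f, item_ct1);
+                   }).wait();
+
+    std::cout << "out[10] = " << out[10] << std::endl; // expect 20
+    sycl::free(in, q);
+    sycl::free(out, q);
+    return 0;
+}
+```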
+
+Run the migration as follows:
+
+```sh
+# custom_gguf_HOME=ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/
+$ export PATH_TO_C2S_INSTALL_FOLDER=~/workspace/c2s_install
+$ source $PATH_TO_C2S_INSTALL_FOLDER/setvars.sh
+$ cd ${custom_gguf_HOME}
+$ c2s dequant.cu \
+  --extra-arg="-I$HOME/.local/lib/python3.10/site-packages/torch/include" \
+  --extra-arg="-I$HOME/.local/lib/python3.10/site-packages/torch/include/torch/csrc/api/include" \
+  --extra-arg="-I/usr/include/python3.10" \
+  --rule-file=$HOME/workspace/c2s_install/extensions/pytorch_api_rules/pytorch_api.yaml
+```
+
+Now you can see the migrated files in ${custom_gguf_HOME}/dpct_output.
+
+### 3 Prepare the running environment
+#### 3.1 Create a virtual environment and source oneAPI
+```
+$ python3 -m venv ~/workspace/xputorch
+$ source ~/workspace/xputorch/bin/activate
+$ source /opt/intel/oneapi/setvars.sh
+$ export LD_LIBRARY_PATH=~/workspace/xputorch/lib/python3.10/site-packages/torch/lib:$LD_LIBRARY_PATH
+```
+#### 3.2 Install XPU torch
+Install the XPU build of PyTorch through:
+
+```
+pip install torch==2.7.0.dev20250305+xpu --extra-index-url https://download.pytorch.org/whl/nightly/xpu
+```
+
+### 4 Build the migrated ktransformers
+There are several tests available at the current stage:
+* 3 SYCL single-kernel tests (passing) in ./migrated/single_kernel_test
+* 4 SYCL single-kernel tests (results mismatch, still being debugged) in ./migrated/single_kernel_test_need_debug
+* 1 torch test for dequantize_q8_0 in ./migrated/torch_test
+* 9 PyTorch tests in ./migrated/python_test; all pass with randomly generated input, compared against the CPU calculation:
+  * dequantize_f32
+  * dequantize_f16
+  * dequantize_q8_0
+  * dequantize_q2_k
+  * dequantize_q3_k
+  * dequantize_q4_k
+  * dequantize_q5_k
+  * dequantize_q6_k
+  * dequantize_iq4_xs
+
+For a C++ test, select one (${test_directory}/${test_name}) and compile it with:
+```
+$ cd ${test_directory}
+$ source /opt/intel/oneapi/setvars.sh
+$ icpx -fsycl -I/opt/intel/oneapi/compiler/latest/include/sycl -I$HOME/workspace/xputorch/lib/python3.10/site-packages/torch/include -I/usr/include/python3.10 -I$HOME/workspace/xputorch/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -L$HOME/workspace/xputorch/lib/python3.10/site-packages/torch/lib -ltorch_xpu -ltorch_cpu -lc10_xpu -lc10 ${test_name} -o ${out_name}
+```
+
+For the Python tests, build the extension and run the tests with pytest:
+```
+$ source ~/workspace/xputorch/bin/activate
+$ source /opt/intel/oneapi/setvars.sh
+$ unset CPATH # avoid duplicated headers in pytorch sycl
+$ cd third-party-programs/ktransformers/custom_gguf/migrated
+$ export CC=icpx
+$ export CXX=icpx
+$ source $SYCLomatic_HOME/setvars.sh
+$ python3 setup.py build_ext --inplace
+
+# Run the pytest
+$ pip install pytest
+$ cd python_test
+$ pytest test_dequant.py
+```
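+
+For reference, a minimal standalone check in the spirit of the single-kernel tests above might look like the sketch below. It uses a simplified q8_0-style layout (a hypothetical example, not one of the shipped tests): dequantize on the device, recompute on the host, and compare, which is the same methodology the Python tests use. It compiles with the same `icpx -fsycl` command shown above.
+
+```cpp
+// Hypothetical single-kernel check: device dequantization vs. CPU reference.
+#include <sycl/sycl.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+int main() {
+    constexpr int blocks = 64, qk = 32; // 64 blocks of 32 int8 weights each
+    std::vector<float> scale(blocks);
+    std::vector<int8_t> quant(blocks * qk);
+    for (int b = 0; b < blocks; ++b) {
+        scale[b] = 0.01f * float(b + 1);
+        for (int i = 0; i < qk; ++i) quant[b * qk + i] = int8_t((b + i) % 127 - 63);
+    }
+
+    // In-order queue so the copies, kernel, and read-back run sequentially.
+    sycl::queue q{sycl::property::queue::in_order()};
+    float*  d_scale = sycl::malloc_device<float>(blocks, q);
+    int8_t* d_quant = sycl::malloc_device<int8_t>(blocks * qk, q);
+    float*  d_out   = sycl::malloc_device<float>(blocks * qk, q);
+    q.memcpy(d_scale, scale.data(), blocks * sizeof(float));
+    q.memcpy(d_quant, quant.data(), blocks * qk * sizeof(int8_t));
+
+    // One work-group per quantization block, one work-item per weight.
+    q.parallel_for(sycl::nd_range<1>(sycl::range<1>(blocks * qk), sycl::range<1>(qk)),
+                   [=](sycl::nd_item<1> it) {
+        size_t g = it.get_group(0), l = it.get_local_id(0);
+        d_out[g * qk + l] = d_scale[g] * float(d_quant[g * qk + l]);
+    });
+
+    std::vector<float> out(blocks * qk);
+    q.memcpy(out.data(), d_out, out.size() * sizeof(float)).wait();
+
+    // CPU reference and element-wise comparison.
+    int mismatches = 0;
+    for (int b = 0; b < blocks; ++b)
+        for (int i = 0; i < qk; ++i)
+            if (std::fabs(out[b * qk + i] - scale[b] * float(quant[b * qk + i])) > 1e-6f)
+                ++mismatches;
+    std::printf("%s (%d mismatches)\n", mismatches ? "FAILED" : "PASSED", mismatches);
+
+    sycl::free(d_scale, q);
+    sycl::free(d_quant, q);
+    sycl::free(d_out, q);
+    return mismatches != 0;
+}
+```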
+
+### 5 Run migrated SYCL version ktransformers
+```
+$ ./${out_name}
+```
+
+
+## ktransformers License
+[LICENSE](https://github.com/kvcache-ai/ktransformers/blob/main/LICENSE)
+
+## Reference
+* Command Line Options of [SYCLomatic](https://oneapi-src.github.io/SYCLomatic/dev_guide/command-line-options-reference.html) or [Intel® DPC++ Compatibility Tool](https://software.intel.com/content/www/us/en/develop/documentation/intel-dpcpp-compatibility-tool-user-guide/top/command-line-options-reference.html)
+* [oneAPI GPU Optimization Guide](https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/)
+* [SYCLomatic project](https://github.com/oneapi-src/SYCLomatic/)
+
+
+## Trademarks information
+Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries.
+\*Other names and brands may be claimed as the property of others. SYCL is a trademark of the Khronos Group Inc.
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/MainSourceFiles.yaml b/third-party-programs/ktransformers/custom_gguf/migrated/MainSourceFiles.yaml new file mode 100644 index 000000000..63d825773 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/MainSourceFiles.yaml @@ -0,0 +1,3006 @@ +--- +MainSourceFile: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/MainSrcFiles_placehold' +Replacements: + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 352 + Length: 26 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 378 + Length: 23 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 401 + Length: 23 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 524 + Length: 31 + ReplacementText: "#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 557 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 699 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 730 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 743 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 756 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 846 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 +
InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 859 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1039 + Length: 33 + ReplacementText: 'sycl::vec(*((sycl::half*)cur_block)).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1214 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1278 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1357 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1388 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1401 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1414 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1504 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1517 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1538 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1573 + Length: 6 + ReplacementText: 
'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1700 + Length: 33 + ReplacementText: 'sycl::vec(*((sycl::half*)cur_block)).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1835 + Length: 34 + ReplacementText: 'sycl::vec(scale * cur_block[i]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1890 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 1954 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2038 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2069 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2082 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2095 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2185 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2198 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2219 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2259 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2391 + Length: 33 + ReplacementText: 'sycl::vec(*((sycl::half*)cur_block)).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2526 + Length: 38 + ReplacementText: 'sycl::ext::oneapi::bfloat16(scale * cur_block[i])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 2714 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3014 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3156 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3187 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3200 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3213 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3298 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3311 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3434 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + 
block_id * blk_size + 80))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 3541 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 82))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4506 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4570 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4649 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4680 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4693 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4706 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4791 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4804 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4824 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4859 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 4929 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 80))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 5036 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 82))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 5603 + Length: 55 + ReplacementText: 'sycl::vec(dl * ((int8_t)((q[l] >> shift) & 3)) - ml).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 5886 + Length: 58 + ReplacementText: 'sycl::vec(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6029 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6093 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6177 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6208 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6221 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6234 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6319 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6332 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6352 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6392 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6467 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 80))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 6574 + Length: 79 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 82))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7141 + Length: 59 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7428 + Length: 62 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7575 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7717 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7753 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7766 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7779 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7948 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 7961 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 8163 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 108))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9745 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9809 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9888 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9924 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9937 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 9950 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 10119 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 10132 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' 
+ ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 10152 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 10187 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 10336 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 108))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 11504 + Length: 80 + ReplacementText: 'sycl::vec(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 11738 + Length: 80 + ReplacementText: 'sycl::vec(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 11946 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12010 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12094 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12130 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12143 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12156 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + 
NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12325 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12338 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12358 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12398 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 12552 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 108))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 13720 + Length: 84 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 13958 + Length: 84 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 
0 : 4)))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14171 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14313 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14344 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14357 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14370 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14454 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14467 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14701 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 14802 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15488 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15552 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15631 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15662 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15675 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15688 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15772 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15785 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15805 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 15840 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16021 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16122 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16654 + Length: 36 + ReplacementText: 'sycl::vec(d1 * (q[l] & 0xF) - m1).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' 
+ Offset: 16749 + Length: 36 + ReplacementText: 'sycl::vec(d2 * (q[l] >> 4) - m2).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16836 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16900 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 16984 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17015 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17028 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17041 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17125 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17138 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17158 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17198 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17384 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + 
NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 17485 + Length: 73 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 144 + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18017 + Length: 40 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d1 * (q[l] & 0xF) - m1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18116 + Length: 40 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d2 * (q[l] >> 4) - m2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18207 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18349 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18380 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18393 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18406 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18496 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18509 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18632 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 18738 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19718 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19782 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19861 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19892 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19905 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 19918 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20008 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20021 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20041 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20076 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20146 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * 
blk_size + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20252 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 20991 + Length: 63 + ReplacementText: 'sycl::vec(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21113 + Length: 63 + ReplacementText: 'sycl::vec(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21260 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21324 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21408 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21439 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21452 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21465 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21555 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21568 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 
0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21588 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21628 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21703 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 0))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 21809 + Length: 78 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 2))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 22548 + Length: 67 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 22674 + Length: 67 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 
16 : 0)) - m2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 22825 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 22967 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 22998 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 23011 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 23024 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 23108 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 23121 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 23241 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 208))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24459 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24523 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24602 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24633 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24646 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24659 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24743 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24756 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24776 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24811 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 24878 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 208))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 25772 + Length: 33 + ReplacementText: 'sycl::vec(d * sc[is + 0] * q1).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 25844 + Length: 33 + ReplacementText: 'sycl::vec(d * sc[is + 2] * q2).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 25916 + Length: 33 + ReplacementText: 'sycl::vec(d * sc[is + 4] * q3).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 
25988 + Length: 33 + ReplacementText: 'sycl::vec(d * sc[is + 6] * q4).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26152 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26216 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26300 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26331 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26344 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26357 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26441 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26454 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26474 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26514 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 26586 + Length: 80 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size + 208))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: 
'' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 27480 + Length: 37 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d * sc[is + 0] * q1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 27556 + Length: 37 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d * sc[is + 2] * q2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 27632 + Length: 37 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d * sc[is + 4] * q3)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 27708 + Length: 37 + ReplacementText: 'sycl::ext::oneapi::bfloat16(d * sc[is + 6] * q4)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 27876 + Length: 130 + ReplacementText: 'static dpct::global_memory kvalues_iq4nl(sycl::range<1>(16), {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113});' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28008 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28152 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1,\n const int8_t *kvalues_iq4nl" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28183 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28196 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28209 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28293 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: 
'' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28306 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 28427 + Length: 74 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29228 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29294 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29373 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1,\n const int8_t *kvalues_iq4nl" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29404 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29417 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29430 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29514 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29527 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29548 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29583 + Length: 6 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 29650 + Length: 74 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30264 + Length: 45 + ReplacementText: 'sycl::vec(dl * kvalues_iq4nl[qs[j] & 0xf]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30348 + Length: 44 + ReplacementText: 'sycl::vec(dl * kvalues_iq4nl[qs[j] >> 4]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30479 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30545 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30629 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1,\n const int8_t *kvalues_iq4nl" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30660 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30673 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30686 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30770 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30783 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30804 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30844 + Length: 11 + ReplacementText: 'sycl::ext::oneapi::bfloat16' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 30916 + Length: 74 + ReplacementText: 'sycl::vec(*(reinterpret_cast(data + block_id * blk_size))).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 31530 + Length: 49 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 31618 + Length: 48 + ReplacementText: 'sycl::ext::oneapi::bfloat16(dl * kvalues_iq4nl[qs[j] >> 4])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 31929 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 31983 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32222 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32534 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n 
dequantize_q8_0_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32667 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32731 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q8_0_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32869 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 32932 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q8_0_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 33063 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 33182 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 33406 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 33516 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 33752 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34063 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q6_k_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34196 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34260 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q6_k_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34398 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34461 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q6_k_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, 
ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34592 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34710 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34933 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 34987 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35224 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35535 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q5_k_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35668 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35732 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q5_k_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, 
output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35870 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 35933 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q5_k_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 36064 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 36182 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 36405 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 36514 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 36751 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37062 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n 
dequantize_q4_k_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37195 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37259 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q4_k_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37397 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37460 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q4_k_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37591 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37709 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37932 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 37986 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38223 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38534 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q3_k_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38667 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38731 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q3_k_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38869 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 38932 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q3_k_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, 
ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 39063 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 39181 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 39404 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 39458 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 39695 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40006 + Length: 133 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q2_k_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40139 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40203 + Length: 138 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q2_k_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, 
output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40341 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40404 + Length: 131 + ReplacementText: "{\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_q2_k_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, ele_per_blk, num_blocks, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40535 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40653 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40878 + Length: 0 + ReplacementText: "\n dpct::device_ext &dev_ct1 = dpct::get_current_device();\n sycl::queue &q_ct1 = dev_ct1.in_order_queue();" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 40932 + Length: 27 + ReplacementText: 'c10::OptionalDeviceGuard' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41169 + Length: 80 + ReplacementText: 'q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41480 + Length: 135 + ReplacementText: "{\n kvalues_iq4nl.init();\n\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr();\n\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::half*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), 
\n [=](sycl::nd_item<3> item_ct1) {\n dequantize_iq4_xs_fp16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1, kvalues_iq4nl_ptr_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41615 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41679 + Length: 140 + ReplacementText: "{\n kvalues_iq4nl.init();\n\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr();\n\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16*)output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_iq4_xs_bf16_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_ct1, blk_size, ele_per_blk, num_blocks, item_ct1, kvalues_iq4nl_ptr_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41819 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 41882 + Length: 133 + ReplacementText: "{\n kvalues_iq4nl.init();\n\n dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});\n\n q_ct1.submit(\n [&](sycl::handler &cgh) {\n auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr();\n\n const int8_t * data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr();\n auto output_data_ptr_float_ct1 = output.data_ptr();\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256), sycl::range<3>(1, 1, 256)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_iq4_xs_fp32_kernel(data_gpu_data_ptr_int8_t_ct0, output_data_ptr_float_ct1, blk_size, ele_per_blk, num_blocks, item_ct1, kvalues_iq4nl_ptr_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 42015 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Offset: 42133 + Length: 23 + ReplacementText: 'dev_ct1.queues_wait_and_throw()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu' + Digest: 
406e64c0c53ab47f1bdf9d7fa9d99727 + HasCUDASyntax: true +DpctVersion: 20.0.0 +MainHelperFileName: '' +USMLevel: '' +FeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '' + ValueVec: + - '/home/sijialou/workspace/workloads/ktransformers/ktransformers/ktransformers_ext/cuda/custom_gguf' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + BuildScript: + Value: '0' + Specified: false + CodePinEnabled: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '' + Specified: false + CtadEnabled: + Value: 'false' + Specified: false + EnablepProfiling: + Value: 'false' + Specified: false + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + HelperFuncPreferenceFlag: + Value: '0' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + ValueVec: + - '/home/sijialou/workspace/c2s_install/extensions/pytorch_api_rules/pytorch_api.yaml' + Specified: true + SyclNamedLambda: + Value: 'false' + Specified: false + UseSYCLCompat: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/python_test/custom_gguf.py b/third-party-programs/ktransformers/custom_gguf/migrated/python_test/custom_gguf.py new file mode 100644 index 000000000..d101469e2 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/python_test/custom_gguf.py @@ -0,0 +1,540 @@ +import numpy as np + +# add path +import sys +sys.path.append("../") + +# from ktransformers.util.custom_gguf import dequantize_q4_k_gpu +import torch +import dequantize_extension +torch.set_default_dtype(torch.float32) +import time +from transformers import ( + AutoConfig, +) +from enum import IntEnum +from typing import Sequence +import ctypes + +class GGMLQuantizationType(IntEnum): + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + IQ2_XXS = 16 + IQ2_XS = 17 + IQ3_XXS = 18 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 + BF16 = 30 + +QK_K = 256 +GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ3_XXS: (256, 
2 + QK_K // 4 + QK_K // 8), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), + GGMLQuantizationType.BF16: (1, 2), +} + +# copied from llama.cpp/gguf-py/gguf/quants.py to avoid dependence of gguf +def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): + block_size, type_size = GGML_QUANT_SIZES[quant_type] + if shape[-1] % block_size != 0: + raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") + return (*shape[:-1], shape[-1] // block_size * type_size) + +GGML_TYPES = { + "F32": 0, + "F16": 1, + "Q4_0": 2, + "Q5_0": 6, + "Q8_0": 8, + "Q2_K": 10, + "Q3_K": 11, + "Q4_K": 12, + "Q5_K": 13, + "Q6_K": 14, + "IQ4_XS": 23, + "BF16": 30, +} + +GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()} + +GGML_BLOCK_SIZES = { + "F32": 4, + "F16": 2, + "BF16": 2, + "Q4_0": 2 + 16, + "Q5_0": 2 + 4 + 16, + "Q8_0": 2 + 32, + "Q2_K": 256 // 16 + 256 // 4 + 2 + 2, + "Q3_K": 256 // 8 + 256 // 4 + 12 + 2, + "Q4_K": 2 + 2 + 12 + 256 // 2, + "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2, + "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2, + "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64, + "FP8": 1, +} + +GGML_ELEMENTS_PER_BLOCK = { + "F32": 1, + "F16": 1, + "BF16": 1, + "Q4_0": 32, + "Q5_0": 32, + "Q8_0": 32, + "Q2_K": 256, + "Q3_K": 256, + "Q4_K": 256, + "Q5_K": 256, + "Q6_K": 256, + "IQ4_XS": 256, + "FP8": 1, +} + +DATA_TYPES = { + "uint8": 0, + "int8": 1, + "uint16": 2, + "int16": 3, + "uint32": 4, + "int32": 5, + "float32": 6, + "bool": 7, + "string": 8, + "array": 9, + "uint64": 10, + "int64": 11, + "float64": 12, + "FP8": 13, +} + +def dequantize_q2_k(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547 + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74 + block_size = GGML_BLOCK_SIZES["Q2_K"] + num_blocks = len(data) // block_size + + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) + + dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) + d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32) + scales = data_u8[:, :16].reshape(num_blocks, 16, 1) + qs = data_u8[:, 16:80].reshape(num_blocks, 64) + + tmp = np.stack([ + qs[:, 00:16] >> 0, + qs[:, 16:32] >> 0, + qs[:, 00:16] >> 2, + qs[:, 16:32] >> 2, + qs[:, 00:16] >> 4, + qs[:, 16:32] >> 4, + qs[:, 00:16] >> 6, + qs[:, 16:32] >> 6, + qs[:, 32:48] >> 0, + qs[:, 48:64] >> 0, + qs[:, 32:48] >> 2, + qs[:, 48:64] >> 2, + qs[:, 32:48] >> 4, + qs[:, 48:64] >> 4, + qs[:, 32:48] >> 6, + qs[:, 48:64] >> 6, + ], axis=1) + + return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4) + +def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q2_K"] + ele_per_blk = 
GGML_ELEMENTS_PER_BLOCK["Q2_K"] + data = np.frombuffer(data, dtype=data.dtype) + device = torch.device(device) + # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, + # the best way to fix this is transfer ptr to dequantize_extension instead of Tensor. + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +def dequantize_q3_k(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42 + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95 + block_size = GGML_BLOCK_SIZES["Q3_K"] + num_blocks = len(data) // block_size + + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) + + d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) + bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little") + bits = 4 ^ (bits << 2) + qs = data_u8[:, 32:32 + 64].astype(np.int16) + a, b, c = data_u8[:, 96: 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2) + scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8) + scales[:, 0] = (a & 15) | ((c & 3) << 4) + scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4) + scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4) + scales[:, 3] = (b >> 4) | ((c >> 6) << 4) + scales = scales.reshape(num_blocks, 16, 1).astype(np.int16) + + return d * (scales - 32) * np.stack([ + (((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]), + (((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]), + (((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]), + (((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]), + (((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]), + (((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]), + (((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]), + (((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]), + (((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]), + (((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]), + (((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]), + (((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]), + (((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]), + (((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]), + (((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]), + (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]) + ], axis=1) + +def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q3_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"] + data = np.frombuffer(data, dtype=data.dtype) + device = torch.device(device) + # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, + # the best way to fix this is transfer ptr to dequantize_extension instead of Tensor. 
+ c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +def dequantize_q4_k(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929 + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116 + block_size = GGML_BLOCK_SIZES["Q4_K"] + num_blocks = len(data) // block_size + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) + # Casting to float32 because float16 is very slow on CPU + scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32) + scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32) + qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1) + qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32) + # Dequantize scales and offsets (6 bits and 4 + 2 bits) + factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1) + offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1) + # Interleave low and high quantized bits + qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32) + # Dequantize final weights using scales and offsets + return factors * qs2 - offsets + +def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q4_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"] + data = np.frombuffer(data, dtype=data.dtype) + device = torch.device(device) + # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, + # the best way to fix this is transfer ptr to dequantize_extension instead of Tensor. 
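+    # Usage sketch (raw_bytes stands in for the quantized tensor bytes read
+    # from a GGUF file):
+    #   w = dequantize_q4_k_gpu(raw_bytes, device="xpu", target_dtype=torch.float16)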
+ c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +def dequantize_q5_k(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129 + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138 + block_size = GGML_BLOCK_SIZES["Q5_K"] + num_blocks = len(data) // block_size + + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) + + d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) + dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32) + scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1) + qh = data_u8[:, 16: 16 + 32].reshape(num_blocks, 32, 1) + qs = data_u8[:, 48: 48 + 128].reshape(num_blocks, 4, 32) + + bits = np.unpackbits(qh, axis=-1, bitorder="little") + + qs_hi_4 = qs >> 4 + qs_lo_4 = qs & 15 + + scales_lo_6 = scales[:, :8] & 63 + scales_hi_6 = scales[:, :8] >> 6 + scales_lo_4 = scales[:, 8:] & 15 + scales_hi_4 = scales[:, 8:] >> 4 + + m1 = dmin * scales_lo_6[:, 4] + m2 = dmin * scales_lo_6[:, 5] + m3 = dmin * scales_lo_6[:, 6] + m4 = dmin * scales_lo_6[:, 7] + m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4)) + m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4)) + m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4)) + m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4)) + + d1 = d * scales_lo_6[:, 0] + d2 = d * scales_lo_6[:, 1] + d3 = d * scales_lo_6[:, 2] + d4 = d * scales_lo_6[:, 3] + d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4)) + d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4)) + d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4)) + d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4)) + + return np.concatenate([ + d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1, + d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2, + d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3, + d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4, + d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5, + d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6, + d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7, + d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8, + ], axis=1) + +def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q5_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"] + data = np.frombuffer(data, dtype=data.dtype) + device = torch.device(device) + # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, + # the best way to fix this is transfer ptr to dequantize_extension instead of Tensor. 
+ c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +def dequantize_q6_k(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275 + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152 + block_size = GGML_BLOCK_SIZES["Q6_K"] + num_blocks = len(data) // block_size + + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) + data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size) + + scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32) + # TODO use uint8 and cast later? + ql = data_u8[:, :128].astype(np.int16) + qh = data_u8[:, 128:192].astype(np.int16) + sc = data_i8[:, 192:208, np.newaxis].astype(np.float32) + + # Unpack bits, subtraction requires signed data type + q1 = (ql[:, :32 ] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32 + q2 = (ql[:, 32:64 ] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32 + q3 = (ql[:, :32 ] >> 4) | (((qh[:, :32] >> 4) & 3) << 4) - 32 + q4 = (ql[:, 32:64 ] >> 4) | (((qh[:, :32] >> 6) & 3) << 4) - 32 + q5 = (ql[:, 64:96 ] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32 + q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32 + q7 = (ql[:, 64:96 ] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32 + q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32 + + # Dequantize + return scales * np.concatenate([ + sc[:, 0] * q1[:, :16], + sc[:, 1] * q1[:, 16:], + sc[:, 2] * q2[:, :16], + sc[:, 3] * q2[:, 16:], + sc[:, 4] * q3[:, :16], + sc[:, 5] * q3[:, 16:], + sc[:, 6] * q4[:, :16], + sc[:, 7] * q4[:, 16:], + sc[:, 8] * q5[:, :16], + sc[:, 9] * q5[:, 16:], + sc[:, 10] * q6[:, :16], + sc[:, 11] * q6[:, 16:], + sc[:, 12] * q7[:, :16], + sc[:, 13] * q7[:, 16:], + sc[:, 14] * q8[:, :16], + sc[:, 15] * q8[:, 16:], + ], axis=1) + +# @torch.jit.script +def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["Q6_K"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"] + device = torch.device(device) + num_blocks = len(data) // block_size + data = np.frombuffer(data, dtype=data.dtype) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8) + +def dequantize_iq4_xs(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-quants.c#L3568 + # C struct definition + # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-common.h#L393 + block_size = GGML_BLOCK_SIZES["IQ4_XS"] + num_blocks = len(data) // block_size + + d = np.frombuffer(data, dtype=np.float16)[0::block_size//2].astype(np.float32).reshape(num_blocks, 1) + scales_h = np.frombuffer(data, dtype=np.uint16)[1::block_size//2].reshape(num_blocks, 1) + data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)[:, 4:] + scales_l = data_u8[:, :4].reshape(num_blocks, 4) + qs = data_u8[:, 
4:].reshape(num_blocks, block_size - 8) + + ls = np.zeros((num_blocks, QK_K // 32), dtype=np.int8) + for ib in range(QK_K // 32): + ls[:, ib] = ((scales_l[:, ib // 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h[:, 0] >> 2 * ib) & 3) << 4) + + dl = (d * (ls - 32)).reshape(num_blocks, -1, 1) + + qs_lo_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) & 0xf + qs_hi_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) >> 4 + + y = np.zeros((num_blocks, QK_K), dtype=np.float32) + for ib in range(QK_K // 32): + y[:, ib*32:(ib*32)+16] = dl[:, ib] * kvalues_iq4nl[qs_lo_4[:, ib]] + y[:, (ib*32)+16:(ib*32)+32] = dl[:, ib] * kvalues_iq4nl[qs_hi_4[:, ib]] + + return y.flatten() + +def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()): + block_size = GGML_BLOCK_SIZES["IQ4_XS"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"] + device = torch.device(device) + num_blocks = len(data) // block_size + data = np.frombuffer(data, dtype=data.dtype) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + +def dequantize_q4_0(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515 + # C struct definition + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141 + num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"] + + scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32) + qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:] + + return np.concatenate([ + scales * ((qs & 0xf).astype(np.int8) - 8), + scales * ((qs >> 4).astype(np.int8) - 8), + ], axis=1) + +def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): + raise NotImplementedError() + +def dequantize_q5_0(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556 + # C struct definition + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161 + num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"] + + scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32) + qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4] + qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:] + + bits = np.unpackbits(qh, axis=-1, bitorder="little") + + x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16 + x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16 + + return np.concatenate([ + scales * x0, + scales * x1, + ], axis=1) + +def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): + raise NotImplementedError() + +def dequantize_q8_0(data): + # C struct definition + # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 + num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"] + + scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32) + qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:] + return scales * qs + +def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()): + # C struct definition + # 
https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 + + block_size = GGML_BLOCK_SIZES["Q8_0"] + ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"] + device = torch.device(device) + data = np.frombuffer(data, dtype=data.dtype) + c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents) + return dequantize_extension.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype) + + +def dequantize_f32(data): + return np.frombuffer(data, dtype=np.float32) + +def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()): + data = np.frombuffer(data, dtype=np.float32) + res = torch.from_numpy(data.copy()) + res_gpu = torch.empty_like(res, device=device, dtype=target_dtype) + res_gpu.copy_(res) + return res_gpu + +def dequantize_f16(data): + return np.frombuffer(data, dtype=np.float16) + +def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()): + data = np.frombuffer(data, dtype=np.float16) + res = torch.from_numpy(data.copy()) + res_gpu = torch.empty_like(res, device=device, dtype=target_dtype) + res_gpu.copy_(res) + return res_gpu + +def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()): + data = np.frombuffer(data, dtype=np.float16) + res = torch.from_numpy(data.copy()) + res_gpu = torch.empty_like(res, device=device) + res_gpu.copy_(res) + return res_gpu \ No newline at end of file diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/python_test/test_dequant.py b/third-party-programs/ktransformers/custom_gguf/migrated/python_test/test_dequant.py new file mode 100644 index 000000000..c45e60fe4 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/python_test/test_dequant.py @@ -0,0 +1,62 @@ +import numpy as np +import torch +import pytest +from custom_gguf import * # Make sure to import your custom module + +# Type table mapping GGML quantization types to their corresponding data types +GGML_DATA_TYPES = { + "F32": torch.float32, + "F16": torch.float16, + "Q8_0": torch.float32, # Adjust as needed + "Q2_K": torch.float32, # Adjust as needed + "Q3_K": torch.float32, # Adjust as needed + "Q4_K": torch.float32, # Adjust as needed + "Q5_K": torch.float32, # Adjust as needed + "Q6_K": torch.float32, # Adjust as needed + "IQ4_XS": torch.float32, # Adjust as needed + # Add other mappings as needed +} + +@pytest.mark.parametrize("ggml_name", GGML_DATA_TYPES.keys()) +def test_dequant_function(ggml_name): + num_blocks = 4 + device = "xpu" # or "cpu" if you're not using a GPU + elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name] + block_size = GGML_BLOCK_SIZES[ggml_name] + size = block_size * elements_per_block * num_blocks + target_dtype = GGML_DATA_TYPES[ggml_name] + + # Initialize the 1D np.ndarray with random data + data = np.random.randint(1, 256, size, dtype=np.uint8) + + # Get the CPU and GPU dequantization functions + dequant_cpu_func = globals()[f"dequantize_{ggml_name.lower()}"] + dequant_gpu_func = globals()[f"dequantize_{ggml_name.lower()}_gpu"] + + # Perform dequantization on CPU + res_cpu = dequant_cpu_func(data) + res_cpu = torch.from_numpy(res_cpu) + + # Perform dequantization on GPU + res_gpu = dequant_gpu_func(data, device=device, target_dtype=target_dtype) + res_gpu = res_gpu.cpu().view(res_cpu.shape) + + # Check if all elements are either close or NaN in both tensors + close_or_nan_both = torch.isclose(res_cpu, res_gpu) | (torch.isnan(res_cpu) & torch.isnan(res_gpu)) + 
   all_elements_close = close_or_nan_both.all()
+
+    # Print "Pass" or "Fail" based on the comparison result
+    if all_elements_close:
+        print(f"Pass for {ggml_name}")
+    else:
+        print(f"Fail for {ggml_name}")
+        # Print the indices and the values from both tensors
+        not_close_and_not_nan_both = ~close_or_nan_both
+        differing_indices = not_close_and_not_nan_both.nonzero(as_tuple=False)
+        cpu_values = res_cpu[not_close_and_not_nan_both]
+        gpu_values = res_gpu[not_close_and_not_nan_both]
+        for idx, cpu_val, gpu_val in zip(differing_indices, cpu_values, gpu_values):
+            print(f"Index: {idx}, CPU value: {cpu_val}, GPU value: {gpu_val}")
+
+    assert all_elements_close, f"Dequantization failed for {ggml_name}"
+
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/setup.py b/third-party-programs/ktransformers/custom_gguf/migrated/setup.py
new file mode 100644
index 000000000..c8dc56203
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+from torch.utils.cpp_extension import SyclExtension, BuildExtension
+
+setup(
+    name='dequantize_extension',
+    ext_modules=[
+        SyclExtension(
+            name='dequantize_extension',
+            sources=[
+                'src/bindings.cpp',
+                'src/dequant.dp.cpp'
+            ],
+            # library_dirs=[
+            #     '/home/chengxiw/hackathon/workspace/xputorch/lib/python3.10/site-packages/torch/lib'
+            # ],
+            libraries=[
+                'torch_xpu', 'torch_cpu', 'c10_xpu', 'c10'
+            ],
+            extra_compile_args=['-fsycl'],
+            extra_link_args=['-fsycl']
+        ),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension.with_options(use_ninja=False)
+    }
+)
\ No newline at end of file
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_bf16_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_bf16_kernel.cpp
new file mode 100644
index 000000000..474418a86
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_bf16_kernel.cpp
@@ -0,0 +1,99 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q8_0_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
+                          .convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++) {
+            output_blk[i] = sycl::ext::oneapi::bfloat16(scale * cur_block[i]);
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 10;
+    const int ele_per_blk = 8;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<sycl::ext::oneapi::bfloat16> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
+        for (int j = 2; j < blk_size; ++j) {
+            data[i * blk_size + j] = j - 2;
+        }
+    }
+
+    // Create a SYCL queue
+    queue q;
+
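+    // malloc_device returns a device-only USM pointer, so the host cannot read
+    // results until they are copied back with q.memcpy; each memcpy below is
+    // followed by wait() to keep the sequence strictly ordered.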
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    sycl::ext::oneapi::bfloat16* d_output = malloc_device<sycl::ext::oneapi::bfloat16>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q8_0_bf16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            float expected = scale * (j);
+            if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - expected) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp16_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp16_kernel.cpp
new file mode 100644
index 000000000..2da9577dd
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp16_kernel.cpp
@@ -0,0 +1,99 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q8_0_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk =
+            (sycl::half *)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
+                          .convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++) {
+            output_blk[i] =
+                sycl::vec<float, 1>(scale * cur_block[i])
+                    .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+        }
+    }
+}
+
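+// SYCLomatic maps CUDA's __half2float/__float2half intrinsics onto the
+// sycl::vec<T, 1>::convert<U, sycl::rounding_mode::automatic> idiom used
+// above; the host-side check below applies the same float-to-half rounding
+// before comparing against the kernel's output.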
+int main() {
+    // Define the parameters
+    const int blk_size = 10;
+    const int ele_per_blk = 8;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<sycl::half> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
+        for (int j = 2; j < blk_size; ++j) {
+            data[i * blk_size + j] = j - 2;
+        }
+    }
+
+    // Create a SYCL queue
+    queue q;
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    sycl::half* d_output = malloc_device<sycl::half>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::half)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q8_0_fp16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::half)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            sycl::half expected = sycl::vec<float, 1>(scale * (j)).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected)) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp32_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp32_kernel.cpp
new file mode 100644
index 000000000..dee8c978a
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test/test_dequantize_q8_0_fp32_kernel.cpp
@@ -0,0 +1,118 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
+                          .convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++){
+            output_blk[i] = scale * cur_block[i];
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 10;
+    const int ele_per_blk = 8;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<float> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
+        for (int j = 2; j < blk_size; ++j) {
+            data[i * blk_size + j] = j - 2;
+        }
+    }
+
+    // Print the input data
+    std::cout << "Input data:" << std::endl;
+    for (int i = 0; i < num_blocks; ++i) {
+        std::cout << "Block " << i << ":" << std::endl;
+        for (int j = 0; j < blk_size; ++j) {
+            std::cout << static_cast<int>(data[i * blk_size + j]) << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Create a SYCL queue
+    queue q;
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<sycl::info::device::name>() << "\n";
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    float* d_output = malloc_device<float>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(float)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q8_0_fp32_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(float)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Print the output data
+    std::cout << "Output data:" << std::endl;
+    for (int i = 0; i < num_blocks; ++i) {
+        std::cout << "Block " << i << ":" << std::endl;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            std::cout << output[i * ele_per_blk + j] << " ";
+        }
+        std::cout << std::endl;
+    }
+
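+    // The fp32 path computes in float end to end, so the check below can use a
+    // tight 1e-5 tolerance; the fp16/bf16 variants above compare against
+    // explicitly rounded values with a looser 1e-3.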
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half scale = 0.5f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            float expected = scale * (j);
+            if (std::fabs(output[i * ele_per_blk + j] - expected) > 1e-5) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << output[i * ele_per_blk + j] << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_bf16_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_bf16_kernel.cpp
new file mode 100644
index 000000000..22e414f97
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_bf16_kernel.cpp
@@ -0,0 +1,146 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q2_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+
+        const float d =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 80)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 82)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::ext::oneapi::bfloat16(
+                        dl * ((int8_t)((q[l] >> shift) & 3)) - ml);
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::ext::oneapi::bfloat16(
+                        dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml);
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 128 + 16 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
+    const int ele_per_blk = 256;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<sycl::ext::oneapi::bfloat16> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        std::memcpy(data.data() + i * blk_size + 80, &d, sizeof(sycl::half));
+        std::memcpy(data.data() + i * blk_size + 82, &min, sizeof(sycl::half));
+        for (int j = 0; j < 16; ++j) {
+            data[i * blk_size + j] = j;
+        }
+        for (int j = 16; j < 128 + 16; ++j) {
+            data[i * blk_size + j] = (j - 16) % 256;
+        }
+    }
+
+    // Create a SYCL queue
+    queue q;
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    sycl::ext::oneapi::bfloat16* d_output = malloc_device<sycl::ext::oneapi::bfloat16>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q2_k_bf16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
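+    // NOTE: two things in this harness are suspect and are plausible causes of
+    // the mismatches reported for this need_debug folder: (1) the fill loop
+    // over j in [16, 144) above overwrites the d/min half values that were
+    // memcpy'd to offsets 80/82, and (2) the reference below indexes scale
+    // bytes as (j / 64) * 2 and shifts by (j % 16) * 2, while the kernel
+    // consumes one scale byte per 16 outputs (is++) and shifts by
+    // 2 * ((j % 128) / 32). The q2_k fp16/fp32 and q3_k fp32 tests use the
+    // same structure.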
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            // Calculate expected value
+            int block_offset = i * blk_size;
+            int q_offset = block_offset + 16 + (j / 128) * 32;
+            int scale_offset = block_offset + (j / 64) * 2;
+            uint8_t sc = data[scale_offset];
+            float dl = d * (sc & 0xF);
+            float ml = min * (sc >> 4);
+            int q_idx = (j % 64) / 16;
+            int shift = (j % 16) * 2;
+            int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
+            float expected = dl * q_val - ml;
+            sycl::ext::oneapi::bfloat16 expected_bf16 = sycl::ext::oneapi::bfloat16(expected);
+
+            if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected_bf16)) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected_bf16) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp16_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp16_kernel.cpp
new file mode 100644
index 000000000..aa9fa1d6d
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp16_kernel.cpp
@@ -0,0 +1,149 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q2_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk =
+            (sycl::half *)(output + block_id * ele_per_blk);
+
+        const float d =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 80)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 82)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::vec<float, 1>(dl * ((int8_t)((q[l] >> shift) & 3)) - ml)
+                        .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::vec<float, 1>(
+                        dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml)
+                        .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 128 + 16 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
+    const int ele_per_blk = 256;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<sycl::half> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        std::memcpy(data.data() + i * blk_size + 80, &d, sizeof(sycl::half));
+        std::memcpy(data.data() + i * blk_size + 82, &min, sizeof(sycl::half));
+        for (int j = 0; j < 16; ++j) {
+            data[i * blk_size + j] = j;
+        }
+        for (int j = 16; j < 128 + 16; ++j) {
+            data[i * blk_size + j] = (j - 16) % 256;
+        }
+    }
+
+    // Create a SYCL queue
+    queue q;
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    sycl::half* d_output = malloc_device<sycl::half>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::half)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q2_k_fp16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::half)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            // Calculate expected value
+            int block_offset = i * blk_size;
+            int q_offset = block_offset + 16 + (j / 128) * 32;
+            int scale_offset = block_offset + (j / 64) * 2;
+            uint8_t sc = data[scale_offset];
+            float dl = d * (sc & 0xF);
+            float ml = min * (sc >> 4);
+            int q_idx = (j % 64) / 16;
+            int shift = (j % 16) * 2;
+            int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
+            float expected = dl * q_val - ml;
+            sycl::half expected_half = sycl::vec<float, 1>(expected).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+
+            if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected_half)) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected_half) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp32_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp32_kernel.cpp
new file mode 100644
index 000000000..bcb021386
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q2_k_fp32_kernel.cpp
@@ -0,0 +1,158 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+
+        const float d =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 80)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 82)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 128 + 16 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
+    const int ele_per_blk = 256;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<float> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        std::memcpy(data.data() + i * blk_size + 80, &d, sizeof(sycl::half));
+        std::memcpy(data.data() + i * blk_size + 82, &min, sizeof(sycl::half));
+        for (int j = 0; j < 16; ++j) {
+            data[i * blk_size + j] = j;
+        }
+        for (int j = 16; j < 128 + 16; ++j) {
+            data[i * blk_size + j] = (j - 16) % 256;
+        }
+    }
+
+    // Print the input data
+    std::cout << "Input data:" << std::endl;
+    for (int i = 0; i < num_blocks; ++i) {
+        std::cout << "Block " << i << ":" << std::endl;
+        for (int j = 0; j < blk_size; ++j) {
+            std::cout << static_cast<int>(data[i * blk_size + j]) << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Create a SYCL queue
+    queue q;
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    float* d_output = malloc_device<float>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(float)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q2_k_fp32_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(float)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Print the output data
+    std::cout << "Output data:" << std::endl;
+    for (int i = 0; i < num_blocks; ++i) {
+        std::cout << "Block " << i << ":" << std::endl;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            std::cout << output[i * ele_per_blk + j] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d = 0.5f;
+        sycl::half min = 0.1f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            // Calculate expected value
+            int block_offset = i * blk_size;
+            int q_offset = block_offset + 16 + (j / 128) * 32;
+            int scale_offset = block_offset + (j / 64) * 2;
+            uint8_t sc = data[scale_offset];
+            float dl = d * (sc & 0xF);
+            float ml = min * (sc >> 4);
+            int q_idx = (j % 64) / 16;
+            int shift = (j % 16) * 2;
+            int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
+            float expected = dl * q_val - ml;
+
+            if (std::fabs(output[i * ele_per_blk + j] - expected) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << output[i * ele_per_blk + j] << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q3_k_fp32_kernel.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q3_k_fp32_kernel.cpp
new file mode 100644
index 000000000..8dea71422
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/single_kernel_test_need_debug/test_dequantize_q3_k_fp32_kernel.cpp
@@ -0,0 +1,157 @@
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <cstring>
+#include <cmath>
+
+using namespace sycl;
+
+void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                           item_ct1.get_local_id(2);
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id +=
+         item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+
+        uint32_t aux[4];
+        const int8_t * scales = (const int8_t*)aux;
+        const float d_all =
+            sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
+                                         data + block_id * blk_size + 108)))
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
+        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
+        uint8_t m = 1;
+
+        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
+
+        for (int i = 0; i < 3; i++) {
+            aux[i] = 0;
+            for (int j = 0; j < 4; j++) {
+                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
+            }
+        }
+
+        uint32_t tmp = aux[2];
+        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
+                }
0 : 4));
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+    }
+}
+
+int main() {
+    // Define the parameters
+    const int blk_size = 128 + 32 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
+    const int ele_per_blk = 256;
+    const int num_blocks = 2;
+
+    // Initialize input data
+    std::vector<int8_t> data(blk_size * num_blocks);
+    std::vector<float> output(ele_per_blk * num_blocks, 0.0f);
+
+    // Fill the data with some values
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d_all = 0.5f;
+        std::memcpy(data.data() + i * blk_size + 108, &d_all, sizeof(sycl::half));
+        for (int j = 0; j < 32; ++j) {
+            data[i * blk_size + j] = j % 2; // Initialize hm values
+        }
+        for (int j = 32; j < 128 + 32; ++j) {
+            data[i * blk_size + j] = (j - 32) % 256; // Initialize q values
+        }
+        for (int j = 0; j < 16; ++j) {
+            data[i * blk_size + 96 + j] = j % 16; // Initialize block scales
+        }
+    }
+
+    // Create a SYCL queue
+    queue q;
+
+    // Allocate device memory
+    int8_t* d_data = malloc_device<int8_t>(data.size(), q);
+    float* d_output = malloc_device<float>(output.size(), q);
+
+    // Copy data to device
+    q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
+    q.memcpy(d_output, output.data(), output.size() * sizeof(float)).wait();
+
+    // Define the kernel execution configuration
+    range<3> global_work_size(1, 1, num_blocks);
+    range<3> local_work_size(1, 1, 1);
+
+    // Launch the kernel
+    q.submit([&](handler& h) {
+        h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
+            dequantize_q3_k_fp32_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
+        });
+    }).wait();
+
+    // Copy the result back to host
+    q.memcpy(output.data(), d_output, output.size() * sizeof(float)).wait();
+
+    // Free device memory
+    free(d_data, q);
+    free(d_output, q);
+
+    // Check the results
+    bool success = true;
+    for (int i = 0; i < num_blocks; ++i) {
+        sycl::half d_all = 0.5f;
+        for (int j = 0; j < ele_per_blk; ++j) {
+            // Calculate expected value
+            int block_offset = i * blk_size;
+            int q_offset = block_offset + 32 + (j / 128) * 32;
+            int scale_offset = block_offset + 96 + (j / 64) * 2;
+            uint8_t sc = data[scale_offset];
+            float dl = d_all * (sc - 32);
+            int q_idx = (j % 64) / 16;
+            int shift = (j % 16) * 2;
+            int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
+            uint8_t hm_val = data[block_offset + (j % 32)];
+            uint8_t m = 1 << (j % 8);
+            float expected = dl * (q_val - ((hm_val & m) ? 0 : 4));
+
+            if (std::fabs(output[i * ele_per_blk + j] - expected) > 1e-3) {
+                success = false;
+                std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << output[i * ele_per_blk + j] << std::endl;
+            }
+        }
+    }
+
+    if (success) {
+        std::cout << "Test passed!" << std::endl;
+    } else {
+        std::cout << "Test failed!"
<< std::endl; + } + + return 0; +} diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/src/bindings.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/src/bindings.cpp new file mode 100644 index 000000000..ade8ba6c3 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/src/bindings.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include + +// Include your header file +#include "dequant.dp.hpp" + + +namespace py = pybind11; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dequantize_q8_0", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q8_0 data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q6_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q6_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q5_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q5_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q4_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q4_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q3_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q3_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + m.def("dequantize_q2_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize q2_k data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + + 
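+    // Like the bindings above, dequantize_iq4_xs receives the raw GGUF block
+    // buffer as an integer address (intptr_t), reinterprets it as int8_t*,
+    // and resolves the torch dtype from the Python object before dispatching
+    // to the SYCL implementation. A hypothetical call from Python (module and
+    // variable names are illustrative only, not defined by this file):
+    //   out = ops.dequantize_iq4_xs(addr, num_bytes, blk_size, 256,
+    //                               torch.device("xpu"), torch.float32)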
m.def("dequantize_iq4_xs", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) { + torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype); + return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype); + }, "Function to dequantize iq4_xs data.", + py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype")); + +} \ No newline at end of file diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.cpp new file mode 100644 index 000000000..4dae0e3d0 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.cpp @@ -0,0 +1,1601 @@ +/* + * @Description : + * @Author : Azure-Tang, Boxin Zhang + * @Date : 2024-07-25 13:38:30 + * @Version : 0.2.2 + * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c + * Copyright (c) 2023-2024 The ggml authors + * Copyright (c) 2024 by KVCache.AI, All Rights Reserved. + */ + +#include "dequant.dp.hpp" +#include +#include +#include +#include +#include +#include +#include + +void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + const int8_t* cur_block = data + block_id * blk_size; + float scale = sycl::vec(*((sycl::half *)cur_block)) + .convert()[0]; + cur_block += 2; + for (int i = 0; i < ele_per_blk; i++){ + output_blk[i] = scale * cur_block[i]; + } + } +} + +void dequantize_q8_0_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + const int8_t* cur_block = data + block_id * blk_size; + float scale = sycl::vec(*((sycl::half *)cur_block)) + .convert()[0]; + cur_block += 2; + for (int i = 0; i < ele_per_blk; i++) { + output_blk[i] = + sycl::vec(scale * cur_block[i]) + .convert()[0]; + } + } +} + +void dequantize_q8_0_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + const int8_t* cur_block = data + block_id * blk_size; + float scale = sycl::vec(*((sycl::half *)cur_block)) + .convert()[0]; + cur_block += 2; + for (int i = 
0; i < ele_per_blk; i++) { + output_blk[i] = sycl::ext::oneapi::bfloat16(scale * cur_block[i]); + } + } +} + +// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) { +void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) { + if (j < 4) { + *d = q[j] & 63; *m = q[j + 4] & 63; + } else { + *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} + +void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 80))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 82))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); + + int is = 0; + float dl, ml; + + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++)); + uint8_t sc = *scales; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + + scales = (uint8_t*)(data + block_id * blk_size + (is++)); + sc = *scales; + + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + + shift += 2; + } + q += 32; + } + } +} + +void dequantize_q2_k_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 80))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 82))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); + + int is = 0; + float dl, ml; + + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++)); + uint8_t sc = *scales; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = + sycl::vec(dl * ((int8_t)((q[l] >> shift) & 3)) - + ml) + .convert()[0]; + + scales = (uint8_t*)(data + block_id * blk_size + (is++)); + sc = *scales; + + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = + sycl::vec( + dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml) + .convert()[0]; + + shift += 2; + } + q += 32; + } + } +} + +void dequantize_q2_k_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int 
num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 80))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 82))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16); + + int is = 0; + float dl, ml; + + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++)); + uint8_t sc = *scales; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16( + dl * ((int8_t)((q[l] >> shift) & 3)) - ml); + + scales = (uint8_t*)(data + block_id * blk_size + (is++)); + sc = *scales; + + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16( + dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml); + + shift += 2; + } + q += 32; + } + } +} + +void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + + uint32_t aux[4]; + const int8_t * scales = (const int8_t*)aux; + const float d_all = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 108))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); + const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); + uint8_t m = 1; + + + uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96); + + for (int i = 0; i < 3; i++) { + aux[i] = 0; + for (int j = 0; j < 4; j++) { + aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8); + } + } + + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 
0 : 4)); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + +void dequantize_q3_k_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + + uint32_t aux[4]; + const int8_t * scales = (const int8_t*)aux; + const float d_all = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 108))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); + const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); + uint8_t m = 1; + + + uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96); + + for (int i = 0; i < 3; i++) { + aux[i] = 0; + for (int j = 0; j < 4; j++) { + aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8); + } + } + + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = + sycl::vec(dl * + ((int8_t)((q[l + 0] >> shift) & 3) - + ((hm[l + 0] & m) ? 0 : 4))) + .convert()[0]; + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = + sycl::vec( + dl * ((int8_t)((q[l + 16] >> shift) & 3) - + ((hm[l + 16] & m) ? 
0 : 4))) + .convert()[0]; + } + + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + +void dequantize_q3_k_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + + uint32_t aux[4]; + const int8_t * scales = (const int8_t*)aux; + const float d_all = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 108))) + .convert()[0]; + + const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32); + const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0); + uint8_t m = 1; + + + uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96); + + for (int i = 0; i < 3; i++) { + aux[i] = 0; + for (int j = 0; j < 4; j++) { + aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8); + } + } + + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < 256; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = sycl::ext::oneapi::bfloat16( + dl * ((int8_t)((q[l + 0] >> shift) & 3) - + ((hm[l + 0] & m) ? 0 : 4))); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *output_blk++ = sycl::ext::oneapi::bfloat16( + dl * ((int8_t)((q[l + 16] >> shift) & 3) - + ((hm[l + 16] & m) ? 
0 : 4))); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + + +void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + // const uint8_t * q = data[i].qs; + const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 0))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 2))) + .convert()[0]; + int is = 0; + uint8_t sc, m; + for (int j = 0; j < ele_per_blk; j += 64) { + uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1; + for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >> 4) - m2; + q += 32; is += 2; + } + } +} + +void dequantize_q4_k_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + // const uint8_t * q = data[i].qs; + const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 0))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 2))) + .convert()[0]; + int is = 0; + uint8_t sc, m; + for (int j = 0; j < ele_per_blk; j += 64) { + uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::vec(d1 * (q[l] & 0xF) - m1) + .convert()[0]; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::vec(d2 * (q[l] >> 4) - m2) + .convert()[0]; + q += 32; is += 2; + } + } +} + +void dequantize_q4_k_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + // const uint8_t * q = data[i].qs; + const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 0))) + .convert()[0]; + const float min = + 
sycl::vec(*(reinterpret_cast( + data + block_id * 144 + 2))) + .convert()[0]; + int is = 0; + uint8_t sc, m; + for (int j = 0; j < ele_per_blk; j += 64) { + uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4); + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16(d1 * (q[l] & 0xF) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16(d2 * (q[l] >> 4) - m2); + q += 32; is += 2; + } + } +} + +void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 0))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 2))) + .convert()[0]; + + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4); + + for (int j = 0; j < 256; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; + for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } + } +} + +void dequantize_q5_k_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 0))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 2))) + .convert()[0]; + + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4); + + for (int j = 0; j < 256; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::vec( + d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 
16 : 0)) - m1) + .convert()[0]; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::vec( + d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2) + .convert()[0]; + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } + } +} + +void dequantize_q5_k_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 0))) + .convert()[0]; + const float min = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 2))) + .convert()[0]; + + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16); + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4); + + for (int j = 0; j < 256; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16( + d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1); + for (int l = 0; l < 32; ++l) *output_blk++ = + sycl::ext::oneapi::bfloat16( + d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2); + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } + } +} + +void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 208))) + .convert()[0]; + + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); + const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); + + + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = d * sc[is + 0] * q1; + output_blk[l + 32] = d * sc[is + 2] * q2; + output_blk[l + 64] = d * sc[is + 4] * q3; + output_blk[l + 96] = d * sc[is + 6] * q4; + } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +void dequantize_q6_k_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long 
global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 208))) + .convert()[0]; + + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); + const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); + + + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = + sycl::vec(d * sc[is + 0] * q1) + .convert()[0]; + output_blk[l + 32] = + sycl::vec(d * sc[is + 2] * q2) + .convert()[0]; + output_blk[l + 64] = + sycl::vec(d * sc[is + 4] * q3) + .convert()[0]; + output_blk[l + 96] = + sycl::vec(d * sc[is + 6] * q4) + .convert()[0]; + } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +void dequantize_q6_k_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size + 208))) + .convert()[0]; + + const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size); + const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128); + const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192); + + + for (int n = 0; n < ele_per_blk; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + output_blk[l + 0] = + sycl::ext::oneapi::bfloat16(d * sc[is + 0] * q1); + output_blk[l + 32] = + sycl::ext::oneapi::bfloat16(d * sc[is + 2] * q2); + output_blk[l + 64] = + sycl::ext::oneapi::bfloat16(d * sc[is + 4] * q3); + output_blk[l + 96] = + sycl::ext::oneapi::bfloat16(d * sc[is + 6] * q4); + } + output_blk += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +static dpct::global_memory + kvalues_iq4nl(sycl::range<1>(16), {-127, -104, -83, -65, -49, -35, -22, -10, + 1, 13, 25, 38, 53, 69, 89, 113}); + +void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks, + const sycl::nd_item<3> &item_ct1, + const int8_t *kvalues_iq4nl) { + long long global_idx 
= item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size))) + .convert()[0]; + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); + const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); + const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); + + for (int ib = 0; ib < 8; ++ib) { + const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4); + const float dl = d * (ls - 32); + for (int j = 0; j < 16; ++j) { + output_blk[j + 0] = dl * kvalues_iq4nl[qs[j] & 0xf]; + output_blk[j + 16] = dl * kvalues_iq4nl[qs[j] >> 4]; + } + output_blk += 32; + qs += 16; + } + } +} + +void dequantize_iq4_xs_fp16_kernel(const int8_t *data, sycl::half *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1, + const int8_t *kvalues_iq4nl) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::half *__restrict__ output_blk = + (sycl::half *)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size))) + .convert()[0]; + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); + const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); + const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); + + for (int ib = 0; ib < 8; ++ib) { + const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4); + const float dl = d * (ls - 32); + for (int j = 0; j < 16; ++j) { + output_blk[j + 0] = + sycl::vec(dl * kvalues_iq4nl[qs[j] & 0xf]) + .convert()[0]; + output_blk[j + 16] = + sycl::vec(dl * kvalues_iq4nl[qs[j] >> 4]) + .convert()[0]; + } + output_blk += 32; + qs += 16; + } + } +} + +void dequantize_iq4_xs_bf16_kernel(const int8_t *data, + sycl::ext::oneapi::bfloat16 *output, + const int blk_size, const int ele_per_blk, + const int num_blocks, + const sycl::nd_item<3> &item_ct1, + const int8_t *kvalues_iq4nl) { + long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + for (long long block_id = global_idx; block_id < num_blocks; + block_id += + item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + sycl::ext::oneapi::bfloat16 *__restrict__ output_blk = + (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk); + const float d = + sycl::vec(*(reinterpret_cast( + data + block_id * blk_size))) + .convert()[0]; + const uint16_t scales_h = *(reinterpret_cast(data + block_id * blk_size + 2)); + const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2); + const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4); + + for (int ib = 0; ib < 8; ++ib) { + const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4); + const float dl = d * (ls - 32); + for (int j = 0; j < 16; ++j) { + output_blk[j + 0] = sycl::ext::oneapi::bfloat16( + dl * kvalues_iq4nl[qs[j] & 0xf]); + output_blk[j + 16] = + 
sycl::ext::oneapi::bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]); + } + output_blk += 32; + qs += 16; + } + } +} + +torch::Tensor dequantize_q8_0(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({ num_bytes }, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({ num_blocks, 32 }, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q8_0_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q8_0_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q8_0_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_q6_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + // data.numel%blk_size should be 0, else raise err + int num_blocks = num_bytes / blk_size; + + const c10::OptionalDeviceGuard device_guard(device); + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + 
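+    // Stage the host buffer into the device-resident int8 tensor with an
+    // explicit in-order-queue memcpy; .wait() blocks until the transfer
+    // completes so the kernels submitted below can safely read data_gpu.
+    // The commented-out copy_ call below is a torch-level alternative left
+    // over from the migration.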
//data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q6_k_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q6_k_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q6_k_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_q5_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q5_k_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + 
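+        // Same launch shape as the fp16 case above: 512 work-groups of 256
+        // work-items each, with the grid-stride loop inside the kernel
+        // covering any blocks beyond the 512 * 256 global work-items.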
dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q5_k_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q5_k_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_q4_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + // data.numel%blk_size should be 0, else raise err + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q4_k_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q4_k_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t 
*data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q4_k_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_q3_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q3_k_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q3_k_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q3_k_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_q2_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + 
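+    // dpct::get_current_device() and its in_order_queue() are SYCLomatic's
+    // stand-ins for the implicit CUDA device and default stream; an in-order
+    // queue preserves the FIFO execution order the original CUDA code
+    // relied on.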
sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q2_k_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kBFloat16: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q2_k_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1); + }); + }); + } break; + case torch::kFloat32: { + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_q2_k_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} + +torch::Tensor dequantize_iq4_xs(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.in_order_queue(); + int num_blocks = num_bytes / blk_size; + const c10::OptionalDeviceGuard device_guard(device); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous); + auto data_gpu = torch::empty({num_bytes}, options); + + q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait(); + //data_gpu.copy_(data, false); + + // Create output tensor + auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device)); + + switch (target_dtype) { + case torch::kFloat16: { + kvalues_iq4nl.init(); + + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr(); + + const 
int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_iq4_xs_fp16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1, kvalues_iq4nl_ptr_ct1); + }); + }); + } break; + case torch::kBFloat16: { + kvalues_iq4nl.init(); + + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr(); + + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_ct1 = + (sycl::ext::oneapi::bfloat16 *)output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_iq4_xs_bf16_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_ct1, blk_size, ele_per_blk, + num_blocks, item_ct1, kvalues_iq4nl_ptr_ct1); + }); + }); + } break; + case torch::kFloat32: { + kvalues_iq4nl.init(); + + dpct::has_capability_or_fail(q_ct1.get_device(), + {sycl::aspect::fp16}); + + q_ct1.submit([&](sycl::handler &cgh) { + auto kvalues_iq4nl_ptr_ct1 = kvalues_iq4nl.get_ptr(); + + const int8_t *data_gpu_data_ptr_int8_t_ct0 = + data_gpu.data_ptr(); + auto output_data_ptr_float_ct1 = output.data_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * + sycl::range<3>(1, 1, 256), + sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_iq4_xs_fp32_kernel( + data_gpu_data_ptr_int8_t_ct0, + output_data_ptr_float_ct1, blk_size, + ele_per_blk, num_blocks, item_ct1, + kvalues_iq4nl_ptr_ct1); + }); + }); + } break; + default: + printf("target type not support\n"); + exit(0); + } + dev_ct1.queues_wait_and_throw(); + return output; +} diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.hpp b/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.hpp new file mode 100644 index 000000000..cbd07a7f6 --- /dev/null +++ b/third-party-programs/ktransformers/custom_gguf/migrated/src/dequant.dp.hpp @@ -0,0 +1,44 @@ +#ifndef DEQUANT_DP_H +#define DEQUANT_DP_H + +#include + +torch::Tensor dequantize_q8_0(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + +torch::Tensor dequantize_q6_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + +torch::Tensor dequantize_q5_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + +torch::Tensor dequantize_q4_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + +torch::Tensor dequantize_q3_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + +torch::Tensor dequantize_q2_k(const int8_t *data, const int num_bytes, + const int blk_size, const int ele_per_blk, + const torch::Device device, + const torch::Dtype target_dtype); + + +torch::Tensor 
+torch::Tensor dequantize_q8_0(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_q6_k(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_q5_k(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_q4_k(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_q3_k(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_q2_k(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype);
+
+torch::Tensor dequantize_iq4_xs(const int8_t *data, const int num_bytes,
+                                const int blk_size, const int ele_per_blk,
+                                const torch::Device device,
+                                const torch::Dtype target_dtype);
+
+#endif // DEQUANT_DP_H
\ No newline at end of file
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequant.hpp b/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequant.hpp
new file mode 100644
index 000000000..ecba90e1c
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequant.hpp
@@ -0,0 +1,946 @@
+/*
+ * @Description :
+ * @Author      : Azure-Tang, Boxin Zhang
+ * @Date        : 2024-07-25 13:38:30
+ * @Version     : 0.2.2
+ * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
+ * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
+ */
+#include <cstdint>
+#include <cstdio>
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <torch/library.h>
+#include <torch/extension.h>
+#include <torch/torch.h>
+
+void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block)).convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++) {
+            output_blk[i] = scale * cur_block[i];
+        }
+    }
+}
+
+void dequantize_q8_0_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block)).convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++) {
+            output_blk[i] = sycl::vec<float, 1>(scale * cur_block[i]).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+        }
+    }
+}
+
+void dequantize_q8_0_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+        const int8_t* cur_block = data + block_id * blk_size;
+        float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block)).convert<float, sycl::rounding_mode::automatic>()[0];
+        cur_block += 2;
+        for (int i = 0; i < ele_per_blk; i++) {
+            output_blk[i] = sycl::ext::oneapi::bfloat16(scale * cur_block[i]);
+        }
+    }
+}
+
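+// Q4_K and Q5_K pack eight 6-bit scales and eight 6-bit mins into a 12-byte
+// array; get_scale_min_k4 extracts the j-th scale (*d) and min (*m) from it.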
+// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
+void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
+    if (j < 4) {
+        *d = q[j] & 63; *m = q[j + 4] & 63;
+    } else {
+        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 80))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 82))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+void dequantize_q2_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 80))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 82))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::vec<float, 1>(dl * ((int8_t)((q[l] >> shift) & 3)) - ml).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::vec<float, 1>(dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+void dequantize_q2_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 80))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 82))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
+
+        int is = 0;
+        float dl, ml;
+
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                uint8_t sc = *scales;
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);
+
+                scales = (uint8_t*)(data + block_id * blk_size + (is++));
+                sc = *scales;
+
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *output_blk++ =
+                    sycl::ext::oneapi::bfloat16(dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml);
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
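+// Q3_K block layout (110 bytes): 32 bytes of high-bit masks (hmask), 64 bytes
+// of 2-bit quants (qs), 12 bytes packing sixteen 6-bit scales, and an fp16
+// super-block scale d at offset 108. The kmask shuffle below unpacks the
+// 6-bit scales into the aux words.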
+void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+
+        uint32_t aux[4];
+        const int8_t * scales = (const int8_t*)aux;
+        const float d_all = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 108))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
+        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
+        uint8_t m = 1;
+
+        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
+
+        for (int i = 0; i < 3; i++) {
+            aux[i] = 0;
+            for (int j = 0; j < 4; j++) {
+                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
+            }
+        }
+
+        uint32_t tmp = aux[2];
+        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+    }
+}
+
+void dequantize_q3_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+
+        uint32_t aux[4];
+        const int8_t * scales = (const int8_t*)aux;
+        const float d_all = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 108))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
+        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
+        uint8_t m = 1;
+
+        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
+
+        for (int i = 0; i < 3; i++) {
+            aux[i] = 0;
+            for (int j = 0; j < 4; j++) {
+                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
+            }
+        }
+
+        uint32_t tmp = aux[2];
+        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ =
+                        sycl::vec<float, 1>(dl * ((int8_t)((q[l + 0] >> shift) & 3) - ((hm[l + 0] & m) ? 0 : 4))).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ =
+                        sycl::vec<float, 1>(dl * ((int8_t)((q[l + 16] >> shift) & 3) - ((hm[l + 16] & m) ? 0 : 4))).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+    }
+}
+
+void dequantize_q3_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+
+        uint32_t aux[4];
+        const int8_t * scales = (const int8_t*)aux;
+        const float d_all = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 108))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 32);
+        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
+        uint8_t m = 1;
+
+        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);
+
+        for (int i = 0; i < 3; i++) {
+            aux[i] = 0;
+            for (int j = 0; j < 4; j++) {
+                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
+            }
+        }
+
+        uint32_t tmp = aux[2];
+        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < 256; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = sycl::ext::oneapi::bfloat16(
+                        dl * ((int8_t)((q[l + 0] >> shift) & 3) - ((hm[l + 0] & m) ? 0 : 4)));
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *output_blk++ = sycl::ext::oneapi::bfloat16(
+                        dl * ((int8_t)((q[l + 16] >> shift) & 3) - ((hm[l + 16] & m) ? 0 : 4)));
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+    }
+}
+
+void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+        // const uint8_t * q = data[i].qs;
+        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+        int is = 0;
+        uint8_t sc, m;
+        for (int j = 0; j < ele_per_blk; j += 64) {
+            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
+            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >> 4) - m2;
+            q += 32; is += 2;
+        }
+    }
+}
+
+void dequantize_q4_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+        // const uint8_t * q = data[i].qs;
+        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+        int is = 0;
+        uint8_t sc, m;
+        for (int j = 0; j < ele_per_blk; j += 64) {
+            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::vec<float, 1>(d1 * (q[l] & 0xF) - m1).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::vec<float, 1>(d2 * (q[l] >> 4) - m2).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            q += 32; is += 2;
+        }
+    }
+}
+
+void dequantize_q4_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+        // const uint8_t * q = data[i].qs;
+        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * 144 + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+        int is = 0;
+        uint8_t sc, m;
+        for (int j = 0; j < ele_per_blk; j += 64) {
+            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::ext::oneapi::bfloat16(d1 * (q[l] & 0xF) - m1);
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::ext::oneapi::bfloat16(d2 * (q[l] >> 4) - m2);
+            q += 32; is += 2;
+        }
+    }
+}
+
+void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
+
+        int is = 0;
+        uint8_t sc, m;
+        uint8_t u1 = 1, u2 = 2;
+        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
+
+        for (int j = 0; j < 256; j += 64) {
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
+            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
+            ql += 32; is += 2;
+            u1 <<= 2; u2 <<= 2;
+        }
+    }
+}
+
+void dequantize_q5_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
+
+        int is = 0;
+        uint8_t sc, m;
+        uint8_t u1 = 1, u2 = 2;
+        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
+
+        for (int j = 0; j < 256; j += 64) {
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::vec<float, 1>(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::vec<float, 1>(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            ql += 32; is += 2;
+            u1 <<= 2; u2 <<= 2;
+        }
+    }
+}
+
+void dequantize_q5_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 0))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const float min = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 2))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);
+
+        int is = 0;
+        uint8_t sc, m;
+        uint8_t u1 = 1, u2 = 2;
+        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);
+
+        for (int j = 0; j < 256; j += 64) {
+            get_scale_min_k4(is + 0, scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::ext::oneapi::bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
+            for (int l = 0; l < 32; ++l) *output_blk++ =
+                sycl::ext::oneapi::bfloat16(d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
+            ql += 32; is += 2;
+            u1 <<= 2; u2 <<= 2;
+        }
+    }
+}
+
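+// Q6_K block layout (210 bytes): 128 bytes of low 4-bit quants (ql), 64 bytes
+// of high 2-bit quants (qh), 16 signed 8-bit scales, and an fp16 super-block
+// scale d at offset 208.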
+void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 208))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
+        const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
+
+        for (int n = 0; n < ele_per_blk; n += 128) {
+            for (int l = 0; l < 32; ++l) {
+                int is = l/16;
+                const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+                output_blk[l + 0] = d * sc[is + 0] * q1;
+                output_blk[l + 32] = d * sc[is + 2] * q2;
+                output_blk[l + 64] = d * sc[is + 4] * q3;
+                output_blk[l + 96] = d * sc[is + 6] * q4;
+            }
+            output_blk += 128;
+            ql += 64;
+            qh += 32;
+            sc += 8;
+        }
+    }
+}
+
+void dequantize_q6_k_fp16_kernel(const int8_t *data, sycl::half *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 208))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
+        const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
+
+        for (int n = 0; n < ele_per_blk; n += 128) {
+            for (int l = 0; l < 32; ++l) {
+                int is = l/16;
+                const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+                output_blk[l + 0] = sycl::vec<float, 1>(d * sc[is + 0] * q1).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                output_blk[l + 32] = sycl::vec<float, 1>(d * sc[is + 2] * q2).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                output_blk[l + 64] = sycl::vec<float, 1>(d * sc[is + 4] * q3).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                output_blk[l + 96] = sycl::vec<float, 1>(d * sc[is + 6] * q4).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            }
+            output_blk += 128;
+            ql += 64;
+            qh += 32;
+            sc += 8;
+        }
+    }
+}
+
+void dequantize_q6_k_bf16_kernel(const int8_t *data,
+                                 sycl::ext::oneapi::bfloat16 *output,
+                                 const int blk_size, const int ele_per_blk,
+                                 const int num_blocks,
+                                 const sycl::nd_item<3> &item_ct1) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size + 208))).convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
+        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
+        const int8_t * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
+
+        for (int n = 0; n < ele_per_blk; n += 128) {
+            for (int l = 0; l < 32; ++l) {
+                int is = l/16;
+                const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+                output_blk[l + 0] = sycl::ext::oneapi::bfloat16(d * sc[is + 0] * q1);
+                output_blk[l + 32] = sycl::ext::oneapi::bfloat16(d * sc[is + 2] * q2);
+                output_blk[l + 64] = sycl::ext::oneapi::bfloat16(d * sc[is + 4] * q3);
+                output_blk[l + 96] = sycl::ext::oneapi::bfloat16(d * sc[is + 6] * q4);
+            }
+            output_blk += 128;
+            ql += 64;
+            qh += 32;
+            sc += 8;
+        }
+    }
+}
+
+static dpct::global_memory<const int8_t, 1>
+    kvalues_iq4nl(sycl::range<1>(16), {-127, -104, -83, -65, -49, -35, -22, -10,
+                                       1, 13, 25, 38, 53, 69, 89, 113});
+
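+// IQ4_XS block layout (136 bytes): fp16 super-block scale d, a uint16 of
+// scale high bits (scales_h), 4 bytes of scale low nibbles (scales_l), then
+// 128 bytes of 4-bit indices (qs) into the nonlinear codebook kvalues_iq4nl
+// defined above.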
+void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
+                                   const sycl::nd_item<3> &item_ct1,
+                                   const int8_t *kvalues_iq4nl) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const uint16_t scales_h = *(reinterpret_cast<const uint16_t *>(data + block_id * blk_size + 2));
+        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
+        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
+
+        for (int ib = 0; ib < 8; ++ib) {
+            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
+            const float dl = d * (ls - 32);
+            for (int j = 0; j < 16; ++j) {
+                output_blk[j + 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
+                output_blk[j + 16] = dl * kvalues_iq4nl[qs[j] >> 4];
+            }
+            output_blk += 32;
+            qs += 16;
+        }
+    }
+}
+
+void dequantize_iq4_xs_fp16_kernel(const int8_t *data, sycl::half *output,
+                                   const int blk_size, const int ele_per_blk,
+                                   const int num_blocks,
+                                   const sycl::nd_item<3> &item_ct1,
+                                   const int8_t *kvalues_iq4nl) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::half *__restrict__ output_blk = (sycl::half *)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const uint16_t scales_h = *(reinterpret_cast<const uint16_t *>(data + block_id * blk_size + 2));
+        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
+        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
+
+        for (int ib = 0; ib < 8; ++ib) {
+            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
+            const float dl = d * (ls - 32);
+            for (int j = 0; j < 16; ++j) {
+                output_blk[j + 0] = sycl::vec<float, 1>(dl * kvalues_iq4nl[qs[j] & 0xf]).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+                output_blk[j + 16] = sycl::vec<float, 1>(dl * kvalues_iq4nl[qs[j] >> 4]).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+            }
+            output_blk += 32;
+            qs += 16;
+        }
+    }
+}
+
+void dequantize_iq4_xs_bf16_kernel(const int8_t *data,
+                                   sycl::ext::oneapi::bfloat16 *output,
+                                   const int blk_size, const int ele_per_blk,
+                                   const int num_blocks,
+                                   const sycl::nd_item<3> &item_ct1,
+                                   const int8_t *kvalues_iq4nl) {
+    long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    for (long long block_id = global_idx; block_id < num_blocks;
+         block_id += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
+            (sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
+        const float d = sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(data + block_id * blk_size))).convert<float, sycl::rounding_mode::automatic>()[0];
+        const uint16_t scales_h = *(reinterpret_cast<const uint16_t *>(data + block_id * blk_size + 2));
+        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
+        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);
+
+        for (int ib = 0; ib < 8; ++ib) {
+            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
+            const float dl = d * (ls - 32);
+            for (int j = 0; j < 16; ++j) {
+                output_blk[j + 0] = sycl::ext::oneapi::bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]);
+                output_blk[j + 16] = sycl::ext::oneapi::bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]);
+            }
+            output_blk += 32;
+            qs += 16;
+        }
+    }
+}
diff --git a/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequantize_q8_0.cpp b/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequantize_q8_0.cpp
new file mode 100644
index 000000000..4b60ca4c0
--- /dev/null
+++ b/third-party-programs/ktransformers/custom_gguf/migrated/torch_test/dequantize_q8_0.cpp
@@ -0,0 +1,122 @@
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "dequant.hpp"
+
+torch::Tensor dequantize_q8_0(const int8_t *data, const int num_bytes,
+                              const int blk_size, const int ele_per_blk,
+                              const torch::Device device,
+                              const torch::Dtype target_dtype) {
+    dpct::device_ext &dev_ct1 = dpct::get_current_device();
+    sycl::queue &q_ct1 = dev_ct1.in_order_queue();
+    int num_blocks = num_bytes / blk_size;
+    const c10::OptionalDeviceGuard device_guard(device);
+
+    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
+    auto data_gpu = torch::empty({num_bytes}, options);
+
+    q_ct1.memcpy(data_gpu.data_ptr(), data, num_bytes).wait();
+
+    // Create output tensor
+    auto output = torch::zeros({num_blocks, 32}, torch::dtype(target_dtype).device(device));
+
+    switch (target_dtype) {
+    case torch::kFloat16: {
+        dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});
+
+        q_ct1.submit([&](sycl::handler &cgh) {
+            const int8_t *data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr<int8_t>();
+            auto output_data_ptr_ct1 = (sycl::half *)output.data_ptr();
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256),
+                                  sycl::range<3>(1, 1, 256)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    dequantize_q8_0_fp16_kernel(data_gpu_data_ptr_int8_t_ct0,
+                                                output_data_ptr_ct1, blk_size,
+                                                ele_per_blk, num_blocks, item_ct1);
+                });
+        });
+    } break;
+    case torch::kBFloat16: {
+        dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});
+
+        q_ct1.submit([&](sycl::handler &cgh) {
+            const int8_t *data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr<int8_t>();
+            auto output_data_ptr_ct1 = (sycl::ext::oneapi::bfloat16 *)output.data_ptr();
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256),
+                                  sycl::range<3>(1, 1, 256)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    dequantize_q8_0_bf16_kernel(data_gpu_data_ptr_int8_t_ct0,
+                                                output_data_ptr_ct1, blk_size,
+                                                ele_per_blk, num_blocks, item_ct1);
+                });
+        });
+    } break;
+    case torch::kFloat32: {
+        dpct::has_capability_or_fail(q_ct1.get_device(), {sycl::aspect::fp16});
+
+        q_ct1.submit([&](sycl::handler &cgh) {
+            const int8_t *data_gpu_data_ptr_int8_t_ct0 = data_gpu.data_ptr<int8_t>();
+            auto output_data_ptr_float_ct1 = output.data_ptr<float>();
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, 512) * sycl::range<3>(1, 1, 256),
+                                  sycl::range<3>(1, 1, 256)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    dequantize_q8_0_fp32_kernel(data_gpu_data_ptr_int8_t_ct0,
+                                                output_data_ptr_float_ct1, blk_size,
+                                                ele_per_blk, num_blocks, item_ct1);
+                });
+        });
+    } break;
+    default:
+        printf("target type not supported\n");
+        exit(0);
+    }
+
+    dev_ct1.queues_wait_and_throw();
+    return output;
+}
+
+int main() {
+    const int num_bytes = 1024;
+    int8_t data[num_bytes];
+
+    // Fill the input buffer with random bytes standing in for GGUF block data.
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(-128, 127);
+
+    for (int i = 0; i < num_bytes; ++i) {
+        data[i] = static_cast<int8_t>(dis(gen));
+    }
+
+    const int blk_size = 256;
+    const int ele_per_blk = 32;
+    const torch::Device device(torch::kXPU, 0);
+    const torch::Dtype target_dtype = torch::kFloat32;
+
+    torch::Tensor result = dequantize_q8_0(data, num_bytes, blk_size, ele_per_blk, device, target_dtype);
+
+    std::cout << result << std::endl;
+
+    return 0;
+}