Commit b5bfedf

committed: adjust docker file and cutlass instructions

1 parent 6ff7a0d commit b5bfedf

File tree: 8 files changed (+232, -37 lines)

.dev-scripts/.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+test_*.sh
```
extract_install_cmd.py

Lines changed: 99 additions & 0 deletions

````python
# take caution: everything is quite hardcoded here
# any changes to the readme could break this code
# run it from root directory: python extract_install_cmd.py path/to/custom/torch-xxx.whl

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("custom_pytorch_path", help="Path to custom PyTorch wheel")
args = parser.parse_args()

BLOCK_HEADER_START = "### Conda on Linux"

with open("README.md") as infile:
    content = infile.readlines()

local_install_instructions = []
global_install_instructions = []

in_code_block = False
reading_instructions = False
insert_block_pause = False
instruction_type = ""

FILE_INTRO = """#!/usr/bin/env bash

function check_error() {
    # shows and then runs a command. if the exit code is not zero, aborts the script
    # usage: check_error mv foo bar

    echo + $@
    "$@"
    local exit_code=$?
    if [ "${exit_code}" -ne 0 ]; then
        echo "! > An error occurred, aborting."
        exit 1
    fi
}
"""
EXTRA_CONDA_INSTRUCTION = """# extra step for bash script (not required in a proper command line):
eval "$(conda shell.bash hook)"
"""


for line in content:
    if line.startswith("```"):
        in_code_block = not in_code_block
        continue
    if line.startswith(BLOCK_HEADER_START):
        reading_instructions = True
        instruction_type = "global"
        continue
    if line.startswith("<details><summary>"):
        instruction_type = "local"
        continue
    if line.startswith("</details>"):
        instruction_type = "both"
        continue
    if line.startswith(BLOCK_HEADER_START.split()[0]):
        reading_instructions = False
        continue
    if not reading_instructions:
        continue
    if not in_code_block:
        insert_block_pause = True
        continue

    # deal with comments
    if line.startswith("# export CC="):
        line = line[2:]
    if line.startswith("#"):
        continue

    # replace some line contents and add some lines
    if "conda activate" in line:
        line = EXTRA_CONDA_INSTRUCTION + "check_error " + line
    if "export BITORCH_WORKSPACE" in line:
        line = line.replace("${HOME}", "$(pwd)")
    if line.startswith("pip install torch-"):
        line = "pip install {}\n".format(args.custom_pytorch_path)

    # decide how to write line
    line_format = "check_error {line}"
    if line.startswith("#"):
        line_format = "{line}"
    if insert_block_pause:
        insert_block_pause = False
        line_format = "\n" + line_format

    # write result line(s)
    if instruction_type == "global" or instruction_type == "both":
        global_install_instructions.append(line_format.format(line=line))
    if instruction_type == "local" or instruction_type == "both":
        local_install_instructions.append(line_format.format(line=line))

with open(".dev-scripts/test_local_conda_install.sh", "w") as outfile:
    outfile.write(FILE_INTRO)
    outfile.writelines(local_install_instructions)
with open(".dev-scripts/test_global_conda_install.sh", "w") as outfile:
    outfile.write(FILE_INTRO)
    outfile.writelines(global_install_instructions)
````
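The core of the script above is a small state machine that toggles a flag at every markdown code fence and only collects lines while inside a fenced block. A minimal, self-contained sketch of that idea (the helper `extract_code_lines` is illustrative, not part of the repository):

```python
FENCE = "`" * 3  # the literal markdown code-fence marker


def extract_code_lines(markdown_text):
    """Collect only lines that sit inside fenced code blocks."""
    in_code_block = False
    collected = []
    for line in markdown_text.splitlines():
        if line.startswith(FENCE):
            # every fence marker flips between "inside" and "outside"
            in_code_block = not in_code_block
            continue
        if in_code_block:
            collected.append(line)
    return collected


sample = "\n".join([
    "# Install",
    "Run this:",
    FENCE + "bash",
    "pip install example",
    FENCE,
    "Done.",
])
print(extract_code_lines(sample))  # → ['pip install example']
```

The same toggle is what lets the extraction script skip prose and headings while turning only the README's shell commands into test scripts.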

CHANGELOG.md

Lines changed: 11 additions & 0 deletions

```diff
@@ -5,6 +5,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 
+## [0.2.2] - 2024/04/29
+
+### Updated
+
+- Building instructions (adding a section for cutlass)
+- Checksums for custom torch builds (within docker)
+
+### Fixed
+
+- An error in `pack_fp_weight`
+
 ## [0.2.1] - 2024/04/27
 
 ### Fixed
```

README.md

Lines changed: 44 additions & 7 deletions

````diff
@@ -40,15 +40,20 @@ The requirements are:
 - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, but gcc 12.x is not supported yet)
 - Python 3.9 or later
 - PyTorch 1.8 or later
-- CUDA Toolkit 11.8 or 12.1 (optional, for CUDA accelerated layers)
 
-For more detailed information, you can check the [requirements of PyTorch](https://github.com/pytorch/pytorch?tab=readme-ov-file#prerequisites).
+Please check your operating system's options for the C++ compiler.
+For more detailed information, you can check the [requirements to build PyTorch from source](https://github.com/pytorch/pytorch?tab=readme-ov-file#prerequisites).
+In addition, for layers to speed up on specific hardware (such as CUDA devices, or MacOS M1/2/3 chips), we recommend installing:
+
+- CUDA Toolkit 11.8 or 12.1 for CUDA accelerated layers
+- **[MLX](https://github.com/ml-explore/mlx)** for mlx-based layers on MacOS
+- **[CUTLASS](https://github.com/NVIDIA/cutlass)** for cutlass-based layers
 
 Currently, the engine **needs to be built from source**.
-We provide instructions how to install Python/PyTorch (and CUDA/MLX) for:
+We provide instructions for the following options:
 
-- Conda + Linux (with CUDA)
-- Docker (with CUDA)
+- Conda + Linux (with CUDA and cutlass)
+- Docker (with CUDA and cutlass)
 - Conda + MacOS (with MLX)
 
 We recommend managing your BITorch Engine installation in a conda environment (otherwise you should adapt/remove certain variables, e.g. `CUDA_HOME`).
@@ -57,6 +62,8 @@ You may wish to adapt the CUDA version to 12.1 where applicable.
 
 ### Conda on Linux (with CUDA)
 
+To use these instructions, you need to have [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html) and a suitable C++ compiler installed.
+
 1. Create Environment for Python 3.9 and activate it:
 ```bash
 conda create -y --name bitorch-engine python=3.9
@@ -72,8 +79,22 @@ pip install torch-2.1.0-cp39-cp39-linux_x86_64.whl
 # optional: install corresponding torchvision (check https://github.com/pytorch/vision?tab=readme-ov-file#installation in the future)
 pip install "torchvision==0.16.0" --index-url https://download.pytorch.org/whl/cu118
 ```
+4. To use cutlass layers, you should also install CUTLASS 2.8.0 (from source) and adjust `CUTLASS_HOME` (this is where we clone and install cutlass)
+(if you have older or newer GPUs you may need to add your [CUDA compute capability](https://developer.nvidia.com/cuda-gpus) in `CUTLASS_NVCC_ARCHS`):
+```bash
+export CUTLASS_HOME="/some/path"
+mkdir -p "${CUTLASS_HOME}"
+git clone --depth 1 --branch "v2.8.0" "https://github.com/NVIDIA/cutlass.git" --recursive ${CUTLASS_HOME}/source
+mkdir -p "${CUTLASS_HOME}/build" && mkdir -p "${CUTLASS_HOME}/install"
+cd "${CUTLASS_HOME}/build"
+cmake ../source -DCMAKE_INSTALL_PREFIX="${CUTLASS_HOME}/install" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_NVCC_ARCHS='75;80;86'
+make -j 4
+cmake --install .
+```
+If you have difficulties installing cutlass, you can check the [official documentation](https://github.com/NVIDIA/cutlass/tree/v2.8.0),
+use the other layers without installing it, or try the docker installation.
 
-Alternatively, you can also save the environment and clone the repository within the same directory.
+As an alternative to the instructions above, you can also store the environment and clone all repositories within one "root" directory.
 
 <details><summary>Click here to expand the instructions for this.</summary>
 
@@ -99,17 +120,33 @@ pip install torch-2.1.0-cp39-cp39-linux_x86_64.whl
 # optional: install corresponding torchvision (check https://github.com/pytorch/vision?tab=readme-ov-file#installation in the future)
 pip install "torchvision==0.16.0" --index-url https://download.pytorch.org/whl/cu118
 ```
+4. To use cutlass layers, you should also install CUTLASS 2.8.0
+(if you have older or newer GPUs you may need to add your [CUDA compute capability](https://developer.nvidia.com/cuda-gpus) in `CUTLASS_NVCC_ARCHS`):
+```bash
+export CUTLASS_HOME="${BITORCH_WORKSPACE}/cutlass"
+mkdir -p "${CUTLASS_HOME}"
+git clone --depth 1 --branch "v2.8.0" "https://github.com/NVIDIA/cutlass.git" --recursive ${CUTLASS_HOME}/source
+mkdir -p "${CUTLASS_HOME}/build" && mkdir -p "${CUTLASS_HOME}/install"
+cd "${CUTLASS_HOME}/build"
+cmake ../source -DCMAKE_INSTALL_PREFIX="${CUTLASS_HOME}/install" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_NVCC_ARCHS='75;80;86'
+make -j 4
+cmake --install .
+cd "${BITORCH_WORKSPACE}"
+```
+If you have difficulties installing cutlass, you can check the [official documentation](https://github.com/NVIDIA/cutlass/tree/v2.8.0),
+use the other layers without installing it, or try the docker installation.
 </details>
 
 After setting up the environment, clone the code and build with pip (to hide the build output remove `-v`):
 
 ```bash
+# make sure you are in a suitable directory, e.g. your bitorch workspace
 git clone --recursive https://github.com/GreenBitAI/bitorch-engine
 cd bitorch-engine
 # only gcc versions 9.x, 10.x, 11.x are supported
 # to select the correct gcc, use:
 # export CC=gcc-11 CPP=g++-11 CXX=g++-11
-CUDA_HOME="${CONDA_PREFIX}" pip install -e . -v
+CPATH="${CUTLASS_HOME}/install/include" CUDA_HOME="${CONDA_PREFIX}" pip install -e . -v
 ```
 
 ### Docker (with CUDA)
````
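The `'75;80;86'` passed to `CUTLASS_NVCC_ARCHS` in the new instructions is a semicolon-separated list of CUDA compute capabilities (7.5, 8.0, 8.6). A small sketch of how such a string could be assembled from `(major, minor)` capability tuples; the helper `nvcc_archs` is hypothetical, not part of the repository (on a CUDA machine with PyTorch, `torch.cuda.get_device_capability()` returns such a tuple):

```python
def nvcc_archs(capabilities):
    """Build a CUTLASS_NVCC_ARCHS-style string, e.g. (7, 5) -> "75".

    Duplicates are dropped, order of first appearance is kept.
    """
    seen = []
    for major, minor in capabilities:
        arch = "{}{}".format(major, minor)
        if arch not in seen:
            seen.append(arch)
    return ";".join(seen)


print(nvcc_archs([(7, 5), (8, 0), (8, 6)]))  # → 75;80;86
```

A GPU older or newer than these three targets would need its own capability appended to the list, which is exactly the adjustment the instructions above ask for.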

bitorch_engine/layers/qlinear/nbit/cuda/utils.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -107,8 +107,9 @@ def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter) -> torch.T
         # Adjust scales and zeros for symmetric quantization without group index
         scales = scales.unsqueeze(1).repeat(1, weight.size(0) // scales.size(0), 1).view(-1, scales.size(-1))
         zeros = zeros.unsqueeze(1).repeat(1, weight.size(0) // zeros.size(0), 1).view(-1, zeros.size(-1))
-        q_perm = qweight.q_perm.unsqueeze(1).repeat(1, weight.size(1)).long()
-        weight = torch.gather(weight, dim=0, index=q_perm)
+        if hasattr(qweight, "q_perm") and qweight.q_perm is not None:
+            q_perm = qweight.q_perm.unsqueeze(1).repeat(1, weight.size(1)).long()
+            weight = torch.gather(weight, dim=0, index=q_perm)
 
         intweight = torch.round((weight + zeros) / scales).to(torch.int32).clamp(0, 2 ** w_bit - 1)
     else:
```
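The fix above wraps the permutation in a guard so that weight parameters without a `q_perm` no longer raise an `AttributeError` (and ones where it is explicitly `None` are skipped too). A torch-free sketch of that guard pattern, using a hypothetical stand-in class (the real `MPQWeightParameter` lives in bitorch-engine):

```python
class FakeQWeight:
    """Stand-in for a quantized weight parameter; attributes are optional."""

    def __init__(self, **attrs):
        self.__dict__.update(attrs)


def needs_permute(qweight):
    # mirrors the guard added in pack_fp_weight:
    # the attribute must exist AND be set to something
    return hasattr(qweight, "q_perm") and qweight.q_perm is not None


print(needs_permute(FakeQWeight()))                  # → False (attribute missing)
print(needs_permute(FakeQWeight(q_perm=None)))       # → False (attribute unset)
print(needs_permute(FakeQWeight(q_perm=[2, 0, 1])))  # → True
```

Checking both conditions matters because `hasattr` alone would still pass for a parameter whose `q_perm` was initialized to `None`.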

docker/Dockerfile

Lines changed: 3 additions & 3 deletions

```diff
@@ -7,10 +7,10 @@ RUN apt-get update && \
     apt-get install -y git && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* && \
-    git clone --depth 1 --branch "v${CUTLASS_VERSION}" "https://github.com/NVIDIA/cutlass.git" --recursive /cutlass && \
+    git clone --depth 1 --branch "v${CUTLASS_VERSION}" "https://github.com/NVIDIA/cutlass.git" --recursive /cutlass/source && \
     mkdir /cutlass/build && \
     cd /cutlass/build && \
-    cmake .. -DCMAKE_INSTALL_PREFIX:PATH=/usr/local -DBUILD_TESTING=OFF -DCUTLASS_NVCC_ARCHS='75;80;86' && \
+    cmake ../source -DCMAKE_INSTALL_PREFIX=/cutlass/install -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_NVCC_ARCHS='75;80;86' && \
     make -j $(nproc) && \
     cmake --install .
@@ -32,7 +32,7 @@ RUN git clone \
     "${GIT_URL}" \
     /bitorch-engine && \
     cd /bitorch-engine && \
-    BIE_FORCE_CUDA="true" pip install -e ${BUILD_TARGET} -v && \
+    BIE_FORCE_CUDA="true" CPATH="/cutlass/install/include" pip install -e ${BUILD_TARGET} -v && \
     rm -rf build/ bitorch_engine.egg-info/
 
 FROM no-examples as example-ready
```
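Both the Dockerfile and the README now install cutlass into a dedicated prefix and point `CPATH` at its `include` directory, so the compiler can locate the headers. A toy sketch of that directory-list lookup; `find_header` and the `cutlass.h` file below are purely illustrative (the real search is done by the compiler):

```python
import os
import tempfile


def find_header(name, search_dirs):
    """Return the first path where `name` exists, like a CPATH lookup."""
    for directory in search_dirs:
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate):
            return candidate
    return None


# simulate /cutlass/install/include containing a header
tmp = tempfile.mkdtemp()
include_dir = os.path.join(tmp, "install", "include")
os.makedirs(include_dir)
open(os.path.join(include_dir, "cutlass.h"), "w").close()

found = find_header("cutlass.h", ["/nonexistent", include_dir])
print(found is not None)  # → True
```

If `CPATH` were left unset, the headers under the custom install prefix would simply never be found, which is why the build command sets it explicitly.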

docker/build_scripts/install_modified_pytorch.sh

Lines changed: 4 additions & 14 deletions

```diff
@@ -17,24 +17,14 @@ file="custom_torch.whl"
 ## adding them here individually is tedious, but we need to build them manually and ensure compatibility anyway
 
 if [ "${from_image}" == "pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel" ]; then
-    gdrive_id="1sS3LS_8wEm2CJ-oCHZAWYeuXjJHXPINP"
+    gdrive_id="1PoVor85-RF3s0KpOP19mFV5hNUnHERa1"
     file="torch-2.2.2-cp310-cp310-linux_x86_64.whl"
-    checksum="1a7e8f1c315d3aefcc65b0a6676857b9cde4877737a134cf1423a048d8938985"
+    checksum="6646519e5e7b4af8f99b79eb9be3e6460b0d05c4695bbf86de02568f37ff3fea"
 fi
 if [ "${from_image}" == "pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel" ]; then
-    gdrive_id="18DP0P9MJ4U211HR5-1ss6NogFPcIOJDR"
+    gdrive_id="1LjFNImboq8QeFSompMS2gPjBRYtP2Dsz"
     file="torch-2.2.2-cp310-cp310-linux_x86_64.whl"
-    checksum="5f89163d910e1e1ee6010e4ea5d478756c021abab1e248be9716d3bee729b9e7"
-fi
-if [ "${from_image}" == "pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel" ]; then
-    gdrive_id="1QK_QqlPubFNgitiOkSABZ3AZyg7M0ezc"
-    file="torch-2.1.0-cp39-cp39-linux_x86_64.whl"
-    checksum="6600c130395b66bd047ca01b077f702703924eb3eaab2d3d04d9eb51154d9080"
-fi
-if [ "${from_image}" == "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel" ]; then
-    gdrive_id="1fguT0jRJwRE1126rPpEvL9G6F246CLar"
-    file="torch-2.1.0-cp39-cp39-linux_x86_64.whl"
-    checksum="10b95aaca45558f3b80ee331677ddd925f3891ef542ab419ae68dd57641b9a12"
+    checksum="2a5953dab7be6c1640112e38ae7519ad88180d9fa79faab6c86dbee6b1cc210e"
 fi
 #if [ "${from_image}" == "pytorch/pytorch:X.X.X-cudaXX.X-cudnn8-devel" ]; then
 #    gdrive_id="xxx"
```