
Failed to build docker image due to missing cutlass/cutlass.h #566

@CoolFish88

Description

System Info

Hello,

I am trying to build the TEI 1.6 docker image on a GPU machine (A10G), whose OS is described by the following parameters:

NAME="Amazon Linux"
VERSION="2"
ID="amzn"
ID_LIKE="centos rhel fedora"
VERSION_ID="2"
PRETTY_NAME="Amazon Linux 2"
ANSI_COLOR="0;33"
CPE_NAME="cpe:2.3:o:amazon:amazon_linux:2"
HOME_URL="https://amazonlinux.com/"

Rust version: 1.85.1

The reason I am trying to build it from scratch, instead of fetching it with
docker pull ghcr.io/huggingface/text-embeddings-inference:1.6, is to benefit from the improvements brought by this recent PR: #559
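For completeness, building from source starts from a fresh checkout of the public TEI repository (I am assuming the default branch already includes the PR):

git clone https://github.com/huggingface/text-embeddings-inference.git
cd text-embeddings-inference
git log --oneline -1   # confirm the checkout includes the recent PR
# then run the docker build command shown below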

As I intend to deploy the TEI container on a SageMaker endpoint, I am using Dockerfile-cuda-all, which references a SageMaker entrypoint script, as opposed to Dockerfile-cuda.

When I run: docker build . -f Dockerfile-cuda-all --build-arg CUDA_COMPUTE_CAP=86
I get the error:

error: failed to run custom build command for `candle-flash-attn-v1 v0.0.1 (/usr/src/candle-extensions/candle-flash-attn-v1)`

Caused by:
  process didn't exit successfully: `/usr/src/target/release/build/candle-flash-attn-v1-74cefb017284d1d2/build-script-build` (exit status: 1)

With the stderr printing:

2.920 --- stderr
2.920 #$ NVVM_BRANCH=nvvm
2.920 #$ SPACE=
2.920 #$ CUDART=cudart
2.920 #$ HERE=/usr/local/cuda/bin
2.920 #$ THERE=/usr/local/cuda/bin
2.920 #$ TARGET_SIZE=
2.920 #$ TARGET_DIR=
2.920 #$ TARGET_DIR=targets/x86_64-linux
2.920 #$ TOP=/usr/local/cuda/bin/..
2.920 #$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
2.920 #$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/src/target/release/deps:/usr/src/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2.920 #$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
2.920 #$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
2.920 #$ NVVM_BRANCH=nvvm
2.920 #$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
2.920 #$ SPACE=
2.920 #$ CUDAFE_FLAGS=
2.920 #$ CUDART=cudart
2.920 #$ PTXAS_FLAGS=
2.920 #$ HERE=/usr/local/cuda/bin
2.920 #$ THERE=/usr/local/cuda/bin
2.920 #$ TARGET_SIZE=
2.920 #$ TARGET_DIR=
2.920 #$ TARGET_DIR=targets/x86_64-linux
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=750 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=140 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=2 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_api.cu" -o "/tmp/tmpxft_00000507_00000000-5_flash_api.cpp4.ii"
2.920 #$ TOP=/usr/local/cuda/bin/..
2.920 #$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
2.920 #$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/src/target/release/deps:/usr/src/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2.920 #$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
2.920 #$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
2.920 #$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
2.920 #$ CUDAFE_FLAGS=
2.920 #$ PTXAS_FLAGS=
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=750 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=140 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=2 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/fmha_fwd_hdim128.cu" -o "/tmp/tmpxft_00000508_00000000-5_fmha_fwd_hdim128.cpp4.ii"
2.920 #$ NVVM_BRANCH=nvvm
2.920 #$ SPACE=
2.920 #$ CUDART=cudart
2.920 #$ HERE=/usr/local/cuda/bin
2.920 #$ THERE=/usr/local/cuda/bin
2.920 #$ TARGET_SIZE=
2.920 #$ TARGET_DIR=
2.920 #$ TARGET_DIR=targets/x86_64-linux
2.920 #$ TOP=/usr/local/cuda/bin/..
2.920 #$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
2.920 #$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/src/target/release/deps:/usr/src/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2.920 #$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
2.920 #$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
2.920 #$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
2.920 #$ NVVM_BRANCH=nvvm
2.920 #$ CUDAFE_FLAGS=
2.920 #$ PTXAS_FLAGS=
2.920 #$ SPACE=
2.920 #$ CUDART=cudart
2.920 #$ HERE=/usr/local/cuda/bin
2.920 #$ THERE=/usr/local/cuda/bin
2.920 #$ TARGET_SIZE=
2.920 #$ TARGET_DIR=
2.920 #$ TARGET_DIR=targets/x86_64-linux
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=750 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=140 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=2 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/fmha_fwd_hdim32.cu" -o "/tmp/tmpxft_00000509_00000000-5_fmha_fwd_hdim32.cpp4.ii"
2.920 #$ TOP=/usr/local/cuda/bin/..
2.920 #$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
2.920 #$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/src/target/release/deps:/usr/src/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2.920 #$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
2.920 #$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
2.920 #$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
2.920 #$ CUDAFE_FLAGS=
2.920 #$ PTXAS_FLAGS=
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=750 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=140 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=2 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/fmha_fwd_hdim64.cu" -o "/tmp/tmpxft_0000050a_00000000-5_fmha_fwd_hdim64.cpp4.ii"
2.920 In file included from kernels/fmha/smem_tile.h:32,
2.920 from kernels/fmha_kernel.h:34,
2.920 from kernels/fmha_fprop_kernel_1xN.h:31,
2.920 from kernels/fmha_fwd_launch_template.h:12,
2.920 from kernels/fmha_fwd_hdim64.cu:5:
2.920 kernels/fmha/gemm.h:32:10: fatal error: cutlass/cutlass.h: No such file or directory
2.920 32 | #include "cutlass/cutlass.h"
2.920 | ^~~~~~~~~~~~~~~~~~~
2.920 compilation terminated.
2.920 In file included from kernels/fmha/smem_tile.h:32,
2.920 from kernels/fmha_kernel.h:34,
2.920 from kernels/fmha_fprop_kernel_1xN.h:31,
2.920 from kernels/fmha_fwd_launch_template.h:12,
2.920 from kernels/fmha_fwd_hdim128.cu:5:
2.920 kernels/fmha/gemm.h:32:10: fatal error: cutlass/cutlass.h: No such file or directory
2.920 32 | #include "cutlass/cutlass.h"
2.920 | ^~~~~~~~~~~~~~~~~~~
2.920 compilation terminated.
2.920 In file included from kernels/fmha/smem_tile.h:32,
2.920 from kernels/fmha_kernel.h:34,
2.920 from kernels/fmha_fprop_kernel_1xN.h:31,
2.920 from kernels/fmha_fwd_launch_template.h:12,
2.920 from kernels/fmha_fwd_hdim32.cu:5:
2.920 kernels/fmha/gemm.h:32:10: fatal error: cutlass/cutlass.h: No such file or directory
2.920 32 | #include "cutlass/cutlass.h"
2.920 | ^~~~~~~~~~~~~~~~~~~
2.920 compilation terminated.
2.920 #$ cudafe++ --c++17 --gnu_version=110400 --display_error_number --orig_src_file_name "kernels/flash_api.cu" --orig_src_path_name "/usr/src/candle-extensions/candle-flash-attn-v1/kernels/flash_api.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000507_00000000-6_flash_api.cudafe1.cpp" --stub_file_name "tmpxft_00000507_00000000-6_flash_api.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000507_00000000-4_flash_api.module_id" "/tmp/tmpxft_00000507_00000000-5_flash_api.cpp4.ii"
2.920 # --error 0x1 --
2.920 # --error 0x1 --
2.920 # --error 0x1 --
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH__=750 -D__CUDA_ARCH_LIST__=750 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=140 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=2 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_api.cu" -o "/tmp/tmpxft_00000507_00000000-7_flash_api.cpp1.ii"
2.920 #$ cicc --c++17 --gnu_version=110400 --display_error_number --orig_src_file_name "kernels/flash_api.cu" --orig_src_path_name "/usr/src/candle-extensions/candle-flash-attn-v1/kernels/flash_api.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_75 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000507_00000000-3_flash_api.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000507_00000000-4_flash_api.module_id" --gen_c_file_name "/tmp/tmpxft_00000507_00000000-6_flash_api.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000507_00000000-6_flash_api.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000507_00000000-6_flash_api.cudafe1.gpu" "/tmp/tmpxft_00000507_00000000-7_flash_api.cpp1.ii" -o "/tmp/tmpxft_00000507_00000000-6_flash_api.ptx"
2.920 #$ ptxas -arch=sm_75 -m64 -v "/tmp/tmpxft_00000507_00000000-6_flash_api.ptx" -o "/tmp/tmpxft_00000507_00000000-8_flash_api.sm_75.cubin"
2.920 ptxas info : 0 bytes gmem
2.920 #$ fatbinary -64 --cmdline="-v " --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=75,file=/tmp/tmpxft_00000507_00000000-8_flash_api.sm_75.cubin" "--image3=kind=ptx,sm=75,file=/tmp/tmpxft_00000507_00000000-6_flash_api.ptx" --embedded-fatbin="/tmp/tmpxft_00000507_00000000-3_flash_api.fatbin.c"
2.920 #$ rm /tmp/tmpxft_00000507_00000000-3_flash_api.fatbin
2.920 #$ gcc -std=c++17 -D__CUDA_ARCH__=750 -D__CUDA_ARCH_LIST__=750 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000507_00000000-6_flash_api.cudafe1.cpp" -o "/usr/src/target/release/build/candle-flash-attn-v1-89935d9d0de9255d/out/kernels/flash_api.o"
2.920 Error: nvcc error while executing compiling: "nvcc" "-O3" "-std=c++17" "--gpu-architecture=sm_75" "-c" "-o" "/usr/src/target/release/build/candle-flash-attn-v1-89935d9d0de9255d/out/kernels/fmha_fwd_hdim64.o" "--default-stream" "per-thread" "-Icutlass/include" "-U__CUDA_NO_HALF_OPERATORS__" "-U__CUDA_NO_HALF_CONVERSIONS__" "-U__CUDA_NO_HALF2_OPERATORS__" "-U__CUDA_NO_BFLOAT16_CONVERSIONS__" "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math" "--ptxas-options=-v" "--verbose" "kernels/fmha_fwd_hdim64.cu"
2.920
2.920 # stdout
2.920
2.920
2.920 # stderr
2.920
2.920 warning: build failed, waiting for other jobs to finish...
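
For what it's worth, the nvcc invocations above pass -Icutlass/include relative to the crate directory, so the missing header should resolve to /usr/src/candle-extensions/candle-flash-attn-v1/cutlass/include/cutlass/cutlass.h inside the build stage. As a rough first check (run on the host from the repository root used as the build context; this assumes nothing about how the Dockerfile is supposed to fetch cutlass), one could look for any cutlass handling in the Dockerfile and in the context itself:

# Look for how the cutlass sources are expected to reach the builder stage
grep -n -i "cutlass\|candle-extensions" Dockerfile-cuda-all
# If the headers were vendored in the build context, they would show up here
find . -path "*cutlass/include/cutlass/cutlass.h" 2>/dev/null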

Information

  • Docker
  • The CLI directly

Tasks

  • An officially supported command
  • My own modifications

Reproduction

docker build . -f Dockerfile-cuda-all --build-arg CUDA_COMPUTE_CAP=86

Expected behavior

Build the docker image without errors
