feat: use aggressive compress-mode for fatbin (#484)

guocuimi · web-flow · commit ffc3f9a66b13 · 2025-07-10T00:04:54.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -201,7 +201,7 @@ else()
   FetchContent_MakeAvailable(libtorch)
 
   find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH)
-  message(STATUS "Downloading and using libtorch 2.7.1 for cuda ${CUDA_VERSION} at ${libtorch_SOURCE_DIR}")
+  message(STATUS "Downloading and using libtorch 2.7.1 for cuda ${CUDAToolkit_VERSION} at ${libtorch_SOURCE_DIR}")
 endif()
 
 # carry over torch flags to the rest of the project
@@ -214,14 +214,21 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DC10_USE_GLOG")
 message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 message(STATUS "CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
 
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -O3)
 # The following definitions must be undefined since half-precision operation is required.
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}
+list(APPEND CUDA_NVCC_FLAGS
       -U__CUDA_NO_HALF_OPERATORS__
       -U__CUDA_NO_HALF_CONVERSIONS__
       -U__CUDA_NO_HALF2_OPERATORS__
-      -U__CUDA_NO_BFLOAT16_CONVERSIONS__)
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math -Xfatbin -compress-all)
+      -U__CUDA_NO_BFLOAT16_CONVERSIONS__
+      -O3
+      --use_fast_math
+      -Xfatbin
+      -compress-all)
+
+# Enable aggressive fatbin compression (-compress-mode=size) for CUDA 12.8 or later; the version is quoted so an empty value evaluates to false instead of breaking if().
+if("${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "12.8")
+  list(APPEND CUDA_NVCC_FLAGS -compress-mode=size)
+endif()
 message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
 
 # enable testing in this directory so we can do a top-level `make test`.