2 files changed: +0 -33 lines

@@ -10,12 +10,6 @@
 #include <hipcub/hipcub.hpp>
 #endif
 
-#ifdef USE_ROCM
-  #include "quantization/fp8/amd/quant_utils.cuh"
-#else
-  #include "quantization/fp8/nvidia/quant_utils.cuh"
-#endif
-
 namespace vllm {
 
 // This kernel uses the _f16Vec to represent vectorized data.
@@ -191,26 +185,6 @@ fused_add_rms_norm_kernel(
   }
 }
 
-/* Function specialization in the case of FP16/BF16 tensors.
-   Additional optimizations we can make in this case are
-   packed and vectorized operations, which help with the
-   memory latency bottleneck. */
-
-template <>
-struct Vec<c10::Float8_e4m3fnuz, 8> {
-  using Type = uint2;
-};
-
-template <>
-struct Vec<c10::Half, 8> {
-  using Type = uint4;
-};
-
-template <>
-struct Vec<c10::BFloat16, 8> {
-  using Type = bf16_8_t;
-};
-
 }  // namespace vllm
 
 #define LAUNCH_RMS_NORM(width) \
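The Vec specializations removed above form a small trait that maps a scalar element type to a packed vector type holding 8 values, so a kernel can issue one wide memory transaction instead of eight scalar ones (the "packed and vectorized operations" the removed comment mentions). Below is a minimal standalone sketch of how such a trait is typically consumed; it is not vLLM's actual kernel, and the names (demo::Vec, copy_half8) are illustrative only.

// Minimal illustrative sketch (assumed names, not vLLM's kernel): how a
// Vec<T, N> trait like the specializations removed above is used to turn
// eight scalar accesses into one 128-bit transaction.
#include <cuda_fp16.h>

namespace demo {

// Map a scalar type to a packed type holding N elements.
template <typename T, int N>
struct Vec;

// Eight halves are 16 bytes, i.e. exactly one uint4, mirroring the
// removed Vec<c10::Half, 8> -> uint4 specialization.
template <>
struct Vec<__half, 8> {
  using Type = uint4;
};

// Each thread copies 8 halves with a single vectorized load and store.
// Assumes in/out are 16-byte aligned and n is a multiple of 8.
__global__ void copy_half8(const __half* __restrict__ in,
                           __half* __restrict__ out, int n) {
  using V = Vec<__half, 8>::Type;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx * 8 < n) {
    reinterpret_cast<V*>(out)[idx] = reinterpret_cast<const V*>(in)[idx];
  }
}

}  // namespace demo

This is the same access pattern the kernel above gets through _f16Vec: the wide loads are what the removed comment means by helping with the memory latency bottleneck.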
@@ -1951,13 +1951,6 @@ def __post_init__(self) -> None:
 
         self._verify_args()
 
-        from vllm.platforms.rocm import on_gfx1x
-        if on_gfx1x() and self.tensor_parallel_size > 1:
-            self.disable_custom_all_reduce = True
-            logger.info(
-                "Disabled the custom all-reduce kernel because it is not "
-                "working correctly on multiple AMD Radeon GPUs.")
-
     @property
     def use_ray(self) -> bool:
         return self.distributed_executor_backend == "ray" or (