
Commit 087e3c1

jwfromm authored and facebook-github-bot committed
Add CPU registrations to custom operators (pytorch#363)
Summary:
X-link: pytorch#3262

Pull Request resolved: facebookresearch/FBGEMM#363

While CPU arguments shouldn't be used for custom CUDA kernels, it turns out they sometimes are in production. The outputs will be garbage, but doing so seems to be part of the model construction process. This small diff fixes the issue by adding CPU registrations for custom operators. This should enable production use cases without breaking torch.export support.

Reviewed By: jaconey, jianyuh, jiawenliu64

Differential Revision: D64703788

fbshipit-source-id: c0c8cfb7f0b67c13be10f419c8e3d83991429edb
1 parent df208f5 commit 087e3c1
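
For context, here is a minimal sketch of the registration pattern the commit relies on. The operator myops::my_allreduce and its schema are hypothetical, invented for illustration; only the TORCH_LIBRARY / TORCH_LIBRARY_IMPL macros and the dual-key registration mirror what the diffs below actually do.

#include <ATen/ATen.h>
#include <torch/library.h>

// Stand-in kernel. The real FBGEMM functions launch CUDA work; when invoked
// with CPU tensors their outputs are explicitly allowed to be garbage.
at::Tensor my_allreduce(at::Tensor input) {
  return input.clone();
}

// Define the operator schema once.
TORCH_LIBRARY(myops, m) {
  m.def("my_allreduce(Tensor input) -> Tensor");
}

// Register the kernel for CUDA dispatch.
TORCH_LIBRARY_IMPL(myops, CUDA, m) {
  m.impl("my_allreduce", my_allreduce);
}

// The pattern this commit adds: a parallel CPU registration of the same
// function, so CPU-tensor calls made during model construction dispatch
// successfully instead of raising a missing-kernel error.
TORCH_LIBRARY_IMPL(myops, CPU, m) {
  m.impl("my_allreduce", my_allreduce);
}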

File tree

2 files changed: 31 additions & 0 deletions


fbgemm_gpu/experimental/gen_ai/src/comm/car.cpp

Lines changed: 11 additions & 0 deletions
@@ -278,4 +278,15 @@ TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
   m.impl("two_shot_car_allreduce", two_shot_car_allreduce);
 }
 
+// Though it shouldn't be used, it is useful to define these functions for CPU
+// to accommodate model creation.
+TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
+  m.impl("nccl_allreduce", nccl_allreduce);
+  m.impl("nccl_allgather", nccl_allgather);
+  m.impl("nccl_alltoall", nccl_alltoall);
+  m.impl("nccl_reducescatter", nccl_reducescatter);
+  m.impl("one_shot_car_allreduce", one_shot_car_allreduce);
+  m.impl("two_shot_car_allreduce", two_shot_car_allreduce);
+}
+
 } // namespace fbgemm_gpu
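
A hedged way to observe the effect of the registration above from C++, using only public dispatcher API. The op name fbgemm::nccl_allreduce comes from the diff; since its full schema is not shown here, this sketch only queries for a CPU kernel rather than calling the op.

#include <ATen/core/dispatch/Dispatcher.h>
#include <iostream>

int main() {
  // Look the operator up by name (empty overload name).
  auto op = c10::Dispatcher::singleton().findOp({"fbgemm::nccl_allreduce", ""});
  if (op.has_value()) {
    // True once the TORCH_LIBRARY_IMPL(fbgemm, CPU, m) block above is linked in.
    std::cout << "has CPU kernel: "
              << op->hasKernelForDispatchKey(c10::DispatchKey::CPU) << "\n";
  }
  return 0;
}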

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp

Lines changed: 20 additions & 0 deletions
@@ -214,4 +214,24 @@ TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
 #endif
 }
 
+// Though it should never be used, it still seems helpful to define these
+// functions for CPU to accommodate model creation.
+TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
+  m.impl("f8f8bf16_blockwise", f8f8bf16_blockwise);
+  m.impl("f8f8bf16_tensorwise", f8f8bf16_tensorwise);
+  m.impl("f8f8bf16_rowwise", f8f8bf16_rowwise);
+  m.impl("quantize_fp8_per_tensor", quantize_fp8_per_tensor);
+  m.impl("quantize_fp8_per_row", quantize_fp8_per_row);
+  m.impl("quantize_fp8_per_col", quantize_fp8_per_col);
+#ifndef USE_ROCM
+  m.impl("i8i8bf16", i8i8bf16);
+  m.impl("f8f8bf16", f8f8bf16);
+  m.impl("f8f8bf16_cublas", f8f8bf16_cublas);
+  m.impl("f8f8bf16_rowwise_batched", f8f8bf16_rowwise_batched);
+  m.impl("f8i4bf16_rowwise", f8i4bf16_rowwise);
+  m.impl("bf16i4bf16_rowwise_batched", bf16i4bf16_rowwise_batched);
+  m.impl("bf16i4bf16_rowwise", bf16i4bf16_rowwise);
+#endif
+}
+
 } // namespace fbgemm_gpu
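
To illustrate the failure mode the commit addresses, here is a self-contained sketch built around a hypothetical op demo::fake_quant (not an FBGEMM operator). Without the CPU registration block, calling the op with a CPU tensor throws a c10::Error complaining that no kernel is registered for the backend; with it, the call completes, returning output that may be garbage but lets model construction and torch.export proceed.

#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/library.h>
#include <torch/torch.h>
#include <iostream>

// Placeholder body; the real kernels do device-specific work.
at::Tensor fake_quant(at::Tensor x) {
  return x;
}

TORCH_LIBRARY(demo, m) {
  m.def("fake_quant(Tensor x) -> Tensor");
}

// Comment this block out to reproduce the pre-fix behavior.
TORCH_LIBRARY_IMPL(demo, CPU, m) {
  m.impl("fake_quant", fake_quant);
}

int main() {
  auto op = c10::Dispatcher::singleton()
                .findSchemaOrThrow("demo::fake_quant", "")
                .typed<at::Tensor(at::Tensor)>();
  try {
    at::Tensor y = op.call(torch::zeros({4}));  // CPU tensor by default
    std::cout << "CPU call succeeded\n";
  } catch (const c10::Error&) {
    std::cout << "CPU call failed: no CPU kernel registered\n";
  }
  return 0;
}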
