@@ -531,21 +531,10 @@ ifndef GGML_NO_ACCELERATE
531531 endif
532532endif # GGML_NO_ACCELERATE
533533
534- ifdef GGML_MUSA
535- CC := clang
536- CXX := clang++
537- GGML_CUDA := 1
538- MK_CPPFLAGS += -DGGML_USE_MUSA
539- endif
540-
541534ifndef GGML_NO_OPENMP
542535 MK_CPPFLAGS += -DGGML_USE_OPENMP
543536 MK_CFLAGS += -fopenmp
544537 MK_CXXFLAGS += -fopenmp
545- ifdef GGML_MUSA
546- MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
547- MK_LDFLAGS += -L/usr/lib/llvm-10/lib
548- endif # GGML_MUSA
549538endif # GGML_NO_OPENMP
550539
551540ifdef GGML_OPENBLAS
@@ -601,27 +590,15 @@ else
601590endif # GGML_CUDA_FA_ALL_QUANTS
602591
603592ifdef GGML_CUDA
604- ifdef GGML_MUSA
605- ifneq ('', '$(wildcard /opt/musa)')
606- CUDA_PATH ?= /opt/musa
607- else
608- CUDA_PATH ?= /usr/local/musa
609- endif
610-
611- MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
612- MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
613- MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
593+ ifneq ('', '$(wildcard /opt/cuda)')
594+ CUDA_PATH ?= /opt/cuda
614595 else
615- ifneq ('', '$(wildcard /opt/cuda)')
616- CUDA_PATH ?= /opt/cuda
617- else
618- CUDA_PATH ?= /usr/local/cuda
619- endif
596+ CUDA_PATH ?= /usr/local/cuda
597+ endif
620598
621- MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
622- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
623- MK_NVCCFLAGS += -use_fast_math
624- endif # GGML_MUSA
599+ MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
600+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
601+ MK_NVCCFLAGS += -use_fast_math
625602
626603 OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
627604 OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -631,11 +608,9 @@ ifdef LLAMA_FATAL_WARNINGS
631608 MK_NVCCFLAGS += -Werror all-warnings
632609endif # LLAMA_FATAL_WARNINGS
633610
634- ifndef GGML_MUSA
635611ifndef JETSON_EOL_MODULE_DETECT
636612 MK_NVCCFLAGS += --forward-unknown-to-host-compiler
637613endif # JETSON_EOL_MODULE_DETECT
638- endif # GGML_MUSA
639614
640615ifdef LLAMA_DEBUG
641616 MK_NVCCFLAGS += -lineinfo
@@ -648,11 +623,7 @@ endif # GGML_CUDA_DEBUG
648623ifdef GGML_CUDA_NVCC
649624 NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
650625else
651- ifdef GGML_MUSA
652- NVCC = $(CCACHE) mcc
653- else
654- NVCC = $(CCACHE) nvcc
655- endif # GGML_MUSA
626+ NVCC = $(CCACHE) nvcc
656627endif # GGML_CUDA_NVCC
657628
658629ifdef CUDA_DOCKER_ARCH
@@ -724,15 +695,9 @@ define NVCC_COMPILE
724695 $(NVCC ) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS ) $(CPPFLAGS ) -Xcompiler "$(CUDA_CXXFLAGS ) " -c $< -o $@
725696endef # NVCC_COMPILE
726697else
727- ifdef GGML_MUSA
728- define NVCC_COMPILE
729- $(NVCC ) $(NVCCFLAGS ) $(CPPFLAGS ) -c $< -o $@
730- endef # NVCC_COMPILE
731- else
732698define NVCC_COMPILE
733699 $(NVCC ) $(NVCCFLAGS ) $(CPPFLAGS ) -Xcompiler "$(CUDA_CXXFLAGS ) " -c $< -o $@
734700endef # NVCC_COMPILE
735- endif # GGML_MUSA
736701endif # JETSON_EOL_MODULE_DETECT
737702
738703ggml/src/ggml-cuda/% .o : \
@@ -874,6 +839,107 @@ ggml/src/ggml-cuda/%.o: \
874839 $(HIPCC ) $(CXXFLAGS ) $(HIPFLAGS ) -x hip -c -o $@ $<
875840endif # GGML_HIPBLAS
876841
842+ ifdef GGML_MUSA
843+ ifeq ($(wildcard /opt/musa),)
844+ MUSA_PATH ?= /usr/local/musa
845+ else
846+ MUSA_PATH ?= /opt/musa
847+ endif
848+ MTGPU_TARGETS ?= mp_21 mp_22
849+
850+ MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
851+ MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
852+ MK_LDFLAGS += -lmusa -lmusart -lmublas
853+
854+ ifndef GGML_NO_OPENMP
855+ # For Ubuntu Focal
856+ MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
857+ MK_LDFLAGS += -L/usr/lib/llvm-10/lib
858+ # For Ubuntu Jammy
859+ MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
860+ MK_LDFLAGS += -L/usr/lib/llvm-14/lib
861+ endif # GGML_NO_OPENMP
862+
863+ CC := $(MUSA_PATH)/bin/clang
864+ CXX := $(MUSA_PATH)/bin/clang++
865+ MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
866+
867+ MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
868+
869+ ifdef GGML_CUDA_FORCE_DMMV
870+ MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
871+ endif # GGML_CUDA_FORCE_DMMV
872+
873+ ifdef GGML_CUDA_FORCE_MMQ
874+ MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
875+ endif # GGML_CUDA_FORCE_MMQ
876+
877+ ifdef GGML_CUDA_FORCE_CUBLAS
878+ MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
879+ endif # GGML_CUDA_FORCE_CUBLAS
880+
881+ ifdef GGML_CUDA_DMMV_X
882+ MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
883+ else
884+ MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
885+ endif # GGML_CUDA_DMMV_X
886+
887+ ifdef GGML_CUDA_MMV_Y
888+ MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
889+ else
890+ MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
891+ endif # GGML_CUDA_MMV_Y
892+
893+ ifdef GGML_CUDA_F16
894+ MUSAFLAGS += -DGGML_CUDA_F16
895+ endif # GGML_CUDA_F16
896+
897+ ifdef GGML_CUDA_DMMV_F16
898+ MUSAFLAGS += -DGGML_CUDA_F16
899+ endif # GGML_CUDA_DMMV_F16
900+
901+ ifdef GGML_CUDA_KQUANTS_ITER
902+ MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
903+ else
904+ MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
905+ endif
906+
907+ ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
908+ MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
909+ else
910+ MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
911+ endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
912+
913+ ifdef GGML_CUDA_NO_PEER_COPY
914+ MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
915+ endif # GGML_CUDA_NO_PEER_COPY
916+
917+ ifdef GGML_CUDA_FA_ALL_QUANTS
918+ MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
919+ endif # GGML_CUDA_FA_ALL_QUANTS
920+
921+ OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
922+ OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
923+ OBJ_GGML += $(OBJ_CUDA_TMPL)
924+
925+ ggml/src/ggml-cuda/ggml-cuda.o : \
926+ ggml/src/ggml-cuda/ggml-cuda.cu \
927+ ggml/include/ggml-cuda.h \
928+ ggml/include/ggml.h \
929+ ggml/include/ggml-backend.h \
930+ ggml/src/ggml-backend-impl.h \
931+ ggml/src/ggml-common.h \
932+ $(wildcard ggml/src/ggml-cuda/* .cuh)
933+ $(MCC ) $(CXXFLAGS ) $(MUSAFLAGS ) -x musa -mtgpu -c -o $@ $<
934+
935+ ggml/src/ggml-cuda/% .o : \
936+ ggml/src/ggml-cuda/%.cu \
937+ ggml/include/ggml.h \
938+ ggml/src/ggml-common.h \
939+ ggml/src/ggml-cuda/common.cuh
940+ $(MCC ) $(CXXFLAGS ) $(MUSAFLAGS ) -x musa -mtgpu -c -o $@ $<
941+ endif # GGML_MUSA
942+
877943ifdef GGML_METAL
878944 MK_CPPFLAGS += -DGGML_USE_METAL
879945 MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
@@ -1002,7 +1068,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
10021068ifdef GGML_CUDA
10031069$(info I NVCC : $(shell $(NVCC ) --version | tail -n 1) )
10041070CUDA_VERSION := $(shell $(NVCC ) --version | grep -oP 'release (\K[0-9]+\.[0-9]) ')
1005- ifndef GGML_MUSA
10061071ifeq ($(shell awk -v "v=$(CUDA_VERSION ) " 'BEGIN { print (v < 11.7) }'),1)
10071072
10081073ifndef CUDA_DOCKER_ARCH
@@ -1012,7 +1077,6 @@ endif # CUDA_POWER_ARCH
10121077endif # CUDA_DOCKER_ARCH
10131078
10141079endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
1015- endif # GGML_MUSA
10161080endif # GGML_CUDA
10171081$(info )
10181082
@@ -1283,6 +1347,7 @@ clean:
12831347 rm -vrf ggml/src/ggml-rpc/* .o
12841348 rm -vrf ggml/src/ggml-sycl/* .o
12851349 rm -vrf ggml/src/ggml-vulkan/* .o
1350+ rm -vrf ggml/src/ggml-musa/* .o
12861351 rm -rvf $(BUILD_TARGETS )
12871352 rm -rvf $(TEST_TARGETS )
12881353 rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
0 commit comments