Commit 7b59859

Merge branch 'ggerganov:master' into mobile_vlm

2 parents 3932fd5 + cc2983d

File tree

137 files changed: 16119 additions, 8704 deletions

.devops/full-musa.Dockerfile

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
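
For local testing, the new image can be built and exercised roughly as follows - a sketch, assuming a models directory on the host. The image tag, model path, and prompt are placeholders, and any flags needed to expose the MUSA device to the container are omitted:

    # Build the full image from the repository root (hypothetical tag)
    docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .

    # tools.sh dispatches to the bundled binaries; --run invokes llama-cli
    docker run -v /path/to/models:/models local/llama.cpp:full-musa \
        --run -m /models/model.gguf -p "Hello" -n 64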

.devops/llama-cli-musa.Dockerfile

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
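
The runtime stage carries only libggml.so, libllama.so, and the llama-cli binary, so the image stays small. A minimal usage sketch (tag and model path are placeholders; device passthrough flags are omitted):

    docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
    docker run -v /path/to/models:/models local/llama.cpp:light-musa \
        -m /models/model.gguf -p "Building a website can be done in 10 simple steps:" -n 128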

.devops/llama-server-musa.Dockerfile

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
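
Because LLAMA_ARG_HOST is pinned to 0.0.0.0 in the image, publishing the port is enough to reach the server from the host. A sketch (tag, host port, and model path are placeholders):

    docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
    docker run -p 8080:8080 -v /path/to/models:/models local/llama.cpp:server-musa \
        -m /models/model.gguf

    # Probe the same endpoint the HEALTHCHECK uses
    curl -f http://localhost:8080/health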

.github/workflows/docker.yml

Lines changed: 3 additions & 0 deletions

@@ -43,6 +43,9 @@ jobs:
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }

CMakeLists.txt

Lines changed: 7 additions & 3 deletions

@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
@@ -201,12 +205,12 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
 endif()
 
-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
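
The net effect: in a standalone checkout LLAMA_STANDALONE is ON and nothing changes, but when llama.cpp is pulled in via add_subdirectory() the common utils library - and with it the tests and examples, which now also require LLAMA_BUILD_COMMON - is skipped unless re-enabled. A configuration sketch (only the flags touched by this change are shown):

    # Standalone build: common, tests and examples behave as before
    cmake -B build

    # Embedded consumers that still want the examples must now opt back in
    cmake -B build -DLLAMA_BUILD_COMMON=ON -DLLAMA_BUILD_EXAMPLES=ON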

Makefile

Lines changed: 19 additions & 5 deletions

@@ -93,11 +93,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
 OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
 OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
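
With these rules in place, the Make build compiles the AMX objects by default; like the other GGML_NO_* switches, this can be disabled at invocation time. A sketch:

    # Default build now includes ggml-amx.o and ggml-amx/mmq.o
    make -j llama-cli

    # Opt out of AMX the same way as, e.g., GGML_NO_LLAMAFILE
    make -j GGML_NO_AMX=1 llama-cli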

README.md

Lines changed: 9 additions & 3 deletions

@@ -29,9 +29,9 @@ variety of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
@@ -122,6 +123,7 @@ Typically finetunes of the base models below are supported as well.
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
@@ -130,6 +132,8 @@ Typically finetunes of the base models below are supported as well.
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
 **UI:**
 
@@ -170,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
@@ -185,6 +190,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
+- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 
 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@@ -413,7 +419,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |

ci/run.sh

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
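
The removed line passed the malformed token DCMAKE_C_COMPILER=icx (missing its leading dash) to cmake as a stray argument instead of selecting the icx compiler; with the dash restored, the SYCL CI branch configures as intended. A sketch of the invocation that exercises it, with placeholder result/mount paths:

    GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt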
