Commit 85bcb08

Merge branch 'master' into esocrok
2 parents: 10124d8 + a86a580


70 files changed: +7144, -2550 lines

.devops/s390x.Dockerfile

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt upgrade -y && \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
    --mount=type=cache,target=/app/build \
    cmake -S . -B build -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
    cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt install -y --no-install-recommends \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
        curl libgomp1 libopenblas-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
    apt install -y \
        git cmake libjpeg-dev \
        python3 python3-pip python3-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
    -r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
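Each of the final stages (full, light, server) can be built on its own with BuildKit's --target flag. A minimal usage sketch, assuming the repository root as build context; the image tags and the model path are illustrative, not part of this commit:

    # build the CLI-only and server images from their respective stages
    docker build -f .devops/s390x.Dockerfile --target light  -t llama.cpp:s390x-light  .
    docker build -f .devops/s390x.Dockerfile --target server -t llama.cpp:s390x-server .

    # run the server; LLAMA_ARG_HOST=0.0.0.0 is already set in the image,
    # and arguments after the image name are appended to the ENTRYPOINT
    docker run -p 8080:8080 -v "$PWD/models:/models" llama.cpp:s390x-server -m /models/model.gguf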

common/common.cpp

Lines changed: 4 additions & 5 deletions
@@ -22,6 +22,7 @@
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
@@ -968,15 +969,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
     bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+    bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-    if (!has_eos && !has_sep) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep && !has_rerank_prompt) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
         ok = false;
     } else if (!has_eos) {
         LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-    } else if (!has_sep) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-        ok = false;
     }
 
     if (!ok) {
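Net effect of this hunk: a model now passes the reranking check when any of the three signals is present, whereas the removed branch used to hard-fail on a missing SEP token alone. Condensed into a single predicate (an illustrative rewrite using only the calls that appear in the patch, not code from the change itself):

    // Illustrative condensation of the check above: reranking is possible if
    // the vocab has an EOS token, a SEP token, or the model ships a dedicated
    // "rerank" chat template (llama_model_chat_template returns NULL if absent).
    static bool reranking_possible(const llama_model * model, const llama_vocab * vocab) {
        const bool has_eos           = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        const bool has_sep           = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
        const bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
        return has_eos || has_sep || has_rerank_prompt;
    }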

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -734,7 +734,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
 
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
 
 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
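The widened alternation (ch|)exps makes the pattern accept the chexps tensor variants in addition to the plain ffn_{up,down,gate}_exps names. A self-contained check of what the per-block pattern now matches (a sketch; the sample tensor names are chosen for illustration):

    #include <cstdio>
    #include <regex>

    int main() {
        // the pattern llm_ffn_exps_block_regex(0) produces after this change
        const std::regex re("blk\\.0\\.ffn_(up|down|gate)_(ch|)exps");
        const char * names[] = {
            "blk.0.ffn_up_exps",      // matched before and after the change
            "blk.0.ffn_gate_chexps",  // matched only after the change
            "blk.0.ffn_up_shexp",     // still not matched
        };
        for (const char * n : names) {
            std::printf("%-22s -> %s\n", n, std::regex_search(n, re) ? "match" : "no match");
        }
    }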

common/sampling.cpp

Lines changed: 1 addition & 0 deletions
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
     if (ctx) {
         llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
     }
 }
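With this one-liner, every existing caller of common_perf_print also gets a memory breakdown alongside the context timings, with no call-site changes. At a typical call site (a sketch; the variable names are illustrative):

    // end-of-run reporting in a tool built on common:
    common_perf_print(ctx, smpl);  // prints sampler + context perf as before,
                                   // and now also llama_memory_breakdown_print(ctx)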
