Commit bae649a

Merge pull request #29 from apicalshark/1: title
2 parents 5535c9f + 78156d7, commit bae649a


62 files changed: +5845 / -28222 lines (large commit; only a subset of the file diffs is shown below)

.devops/full.Dockerfile
Lines changed: 22 additions & 9 deletions

```diff
@@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
 
-COPY requirements.txt requirements.txt
-COPY requirements requirements
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
+FROM ubuntu:$UBUNTU_VERSION as runtime
 
 WORKDIR /app
 
-COPY . .
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 
-ENV LLAMA_CURL=1
+COPY requirements.txt /app/requirements.txt
+COPY requirements /app/requirements
+COPY .devops/tools.sh /app/tools.sh
 
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install -r /app/requirements.txt
 
-RUN make -j$(nproc)
+COPY --from=build /app/build/bin/ /app/
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/convert_hf_to_gguf.py /app/
+COPY --from=build /app/gguf-py /app/gguf-py
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT ["/app/.devops/tools.sh"]
+ENTRYPOINT ["/app/tools.sh"]
```

.devops/llama-cli.Dockerfile
Lines changed: 11 additions & 5 deletions

```diff
@@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-RUN make -j$(nproc) llama-cli
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
+WORKDIR /app
+
 RUN apt-get update && \
-    apt-get install -y libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/app/llama-cli" ]
```

.devops/llama-server.Dockerfile
Lines changed: 7 additions & 15 deletions

```diff
@@ -9,33 +9,25 @@ WORKDIR /app
 
 COPY . .
 
-
-RUN \
-    # Build multiple versions of the CPU backend
-    scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
-    scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
-    scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
-    scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
-    # Build llama-server
-    cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build --target llama-server -j $(nproc) && \
-    # Copy the built libraries to /app/lib
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
     mkdir -p /app/lib && \
-    mv libggml-cpu* /app/lib/ && \
     find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
+WORKDIR /app
+
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/bin/llama-server /llama-server
-COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-server /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/app/llama-server" ]
```

.github/workflows/server.yml
Lines changed: 16 additions & 10 deletions

```diff
@@ -76,20 +76,26 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      - name: Verify server deps
-        id: verify_server_deps
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Verify bundled index.html
+        id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
+          cd examples/server/webui
           git status
-          ./deps.sh
+          npm ci
+          npm run build
           git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
             exit 1
           fi
```
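The CI step now rebuilds the npm-based web UI instead of running `deps.sh`, then fails if the working tree changed. The check can be reproduced locally before pushing; a minimal sketch, assuming Node 22 (as pinned by the workflow) and a clean working tree:

```bash
cd examples/server/webui
npm ci          # install pinned dependencies
npm run build   # regenerate the bundled index.html
git status -s   # any output here means the bundle is stale and CI would fail
```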

.gitignore
Lines changed: 4 additions & 0 deletions

```diff
@@ -104,6 +104,10 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh
 
+# Server Web UI temporary files
+node_modules
+examples/server/webui/dist
+
 # Python
 
 /.venv
```

Makefile
Lines changed: 10 additions & 9 deletions

```diff
@@ -1145,8 +1145,15 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
 # Include dependency files
 -include $(DEP_FILES)
 
+# Clean generated server assets
+clean-server-assets:
+	find examples/server -type f -name "*.js.hpp" -delete
+	find examples/server -type f -name "*.mjs.hpp" -delete
+	find examples/server -type f -name "*.css.hpp" -delete
+	find examples/server -type f -name "*.html.hpp" -delete
+
 # Clean rule
-clean:
+clean: clean-server-assets
 	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -rvf *.a *.dll *.so *.dot
 	find ggml src common tests examples pocs -type f -name "*.o" -delete
@@ -1354,20 +1361,14 @@ llama-server: \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
 	examples/server/index.html.hpp \
-	examples/server/completion.js.hpp \
 	examples/server/loading.html.hpp \
-	examples/server/deps_daisyui.min.css.hpp \
-	examples/server/deps_markdown-it.js.hpp \
-	examples/server/deps_tailwindcss.js.hpp \
-	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
-	common/stb_image.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 	echo "unsigned char $${NAME}[] = {" && \
 	cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1542,7 +1543,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server
+.PHONY: FORCE main quantize perplexity embedding server
 
 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
```
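Adding `FORCE` (declared `.PHONY`) as a prerequisite makes the generated `examples/server/*.hpp` assets rebuild on every `make` invocation, and `clean` now chains through the new `clean-server-assets` target. Usage sketch, using only the targets added here:

```bash
# Remove only the generated server assets
make clean-server-assets

# Full clean now also removes them via the clean-server-assets dependency
make clean
```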

convert_hf_to_gguf.py
Lines changed: 33 additions & 22 deletions

```diff
@@ -1831,29 +1831,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
 
-    def set_vocab(self):
-        self._set_vocab_llama_hf()
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1863,9 +1874,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
```
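The MiniCPM converter now defers common parameters to the base class, derives MiniCPM's embedding/residual/logit scales from `hparams`, switches to the SentencePiece vocab path, and emits long/short RoPE factor tensors for longrope checkpoints. A sketch of exercising the updated path (the model directory and output name are placeholders):

```bash
# Convert a local MiniCPM HF checkpoint with the updated converter
python3 convert_hf_to_gguf.py /path/to/MiniCPM-checkpoint \
    --outfile minicpm.gguf --outtype f16
```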

convert_hf_to_gguf_update.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -17,7 +17,7 @@
 #
 # python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
```

docs/build.md
Lines changed: 13 additions & 8 deletions

```diff
@@ -26,19 +26,24 @@ cmake --build build --config Release
 
 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
 
-```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Debug
-cmake --build build
-```
+    ```bash
+    cmake -B build -DCMAKE_BUILD_TYPE=Debug
+    cmake --build build
+    ```
 
 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
 
-```bash
-cmake -B build -G "Xcode"
-cmake --build build --config Debug
-```
+    ```bash
+    cmake -B build -G "Xcode"
+    cmake --build build --config Debug
+    ```
 
 For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+  ```
+  cmake -B build -DBUILD_SHARED_LIBS=OFF
+  cmake --build build --config Release
+  ```
 
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
   - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
```

examples/deprecation-warning/deprecation-warning.cpp
Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
     }
 
     // Get only the program name from the full path
-    auto pos = filename.find_last_of('/');
+    auto pos = filename.find_last_of("/\\");
     if (pos != std::string::npos) {
         filename = filename.substr(pos+1);
     }
```
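Searching for both separators with `find_last_of("/\\")` makes the program-name extraction work for Windows-style paths (e.g. `C:\bin\main.exe`) as well as POSIX ones.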
