
Commit a29e53c: Merge b3557
2 parents: f56c4b5 + 3071c0a

27 files changed: +1819 -244 lines

.gitignore

Lines changed: 0 additions & 1 deletion

```diff
@@ -79,7 +79,6 @@ models-mnt
 !models/ggml-vocab-*.gguf*
 
 # Zig
-
 zig-out/
 zig-cache/
```

Makefile

Lines changed: 13 additions & 0 deletions

```diff
@@ -19,6 +19,7 @@ BUILD_TARGETS = \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
+	llama-minicpmv-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1206,6 +1207,7 @@ clean:
 	rm -rvf ggml/*.dll
 	rm -rvf ggml/*.so
 	rm -vrf ggml/src/*.o
+	rm -rvf ggml/src/llamafile/*.o
 	rm -rvf common/build-info.cpp
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
@@ -1462,6 +1464,17 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
 
+llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+	examples/llava/clip.h \
+	examples/llava/clip.cpp \
+	examples/llava/llava.h \
+	examples/llava/llava.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
```

convert_hf_to_gguf.py

Lines changed: 45 additions & 54 deletions

```diff
@@ -251,12 +251,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
-
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
 
         return False
@@ -285,54 +280,46 @@ def prepare_tensors(self):
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
                 n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
-
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.uint16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                        )
+                    )
+                    or not name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
 
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
@@ -1765,7 +1752,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(new_name, data_torch)]
 
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused
 
         return n_dims > 1
@@ -2786,18 +2773,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(new_name, data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del n_dims  # unused
-
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        if bid is not None and new_name in (
+            self.format_tensor_name(
+                n, bid, ".weight" if name.endswith(".weight") else ""
+            )
+            for n in [
                 gguf.MODEL_TENSOR.SSM_CONV1D,
                 gguf.MODEL_TENSOR.SSM_X,
                 gguf.MODEL_TENSOR.SSM_DT,
                 gguf.MODEL_TENSOR.SSM_A,
                 gguf.MODEL_TENSOR.SSM_D,
             ]
-        )
+        ):
+            return gguf.GGMLQuantizationType.F32
+
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
 
 @Model.register("CohereForCausalLM")
```

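The change above replaces the old `extra_f32_tensors`/`extra_f16_tensors` hooks with a single `tensor_force_quant` hook: a model class can return a concrete `gguf.GGMLQuantizationType` to pin a tensor's type, `True` to request the file's default quantized type, or `False` to leave the decision to `prepare_tensors`. Below is a minimal sketch of how a subclass might use it; the class name, registered architecture, and tensor choice are hypothetical, and in practice such a class would be defined inside `convert_hf_to_gguf.py` rather than importing `Model` from it.

```python
import gguf
from convert_hf_to_gguf import Model  # only to make the sketch self-contained


@Model.register("MyHypotheticalForCausalLM")
class MyHypotheticalModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA  # placeholder architecture for this sketch

    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
        # Force a concrete type: keep token-type tensors in full precision
        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.TOKEN_TYPES, bid):
            return gguf.GGMLQuantizationType.F32
        # True would mean "quantize to the file's default type";
        # False (the base-class default) means "no override"
        return super().tensor_force_quant(name, new_name, bid, n_dims)
```
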
examples/embedding/README.md

Lines changed: 4 additions & 4 deletions

````diff
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
 ```
 
 ### Windows:
 
 ```powershell
-llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
 ```
 
 The above command will output space-separated float values.
@@ -50,11 +50,11 @@ The above command will output space-separated float values.
 ### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
 
 ### Windows:
 
 ```powershell
-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
````

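The newly added `--pooling mean` flag makes these examples emit one embedding per prompt by averaging the per-token embeddings. As a rough NumPy illustration of the idea (not llama.cpp code, and assuming `--embd-normalize 2` selects Euclidean normalization):

```python
import numpy as np

def mean_pool_and_normalize(token_embeddings: np.ndarray) -> np.ndarray:
    """Average per-token embeddings of shape (n_tokens, n_embd) into one vector, then L2-normalize."""
    pooled = token_embeddings.mean(axis=0)         # mean pooling over the token axis
    norm = np.linalg.norm(pooled)                  # Euclidean (L2) norm
    return pooled / norm if norm > 0.0 else pooled

# toy example: 4 tokens, 8-dimensional hidden size
print(mean_pool_and_normalize(np.random.rand(4, 8)))
```
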
examples/llava/CMakeLists.txt

Lines changed: 7 additions & 0 deletions

```diff
@@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET llama-minicpmv-cli)
+add_executable(${TARGET} minicpmv-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
```

Lines changed: 99 additions & 0 deletions (new file)

## MiniCPM-Llama3-V 2.5

### Prepare models and code

Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into a "MiniCPM-Llama3-V-2_5" folder.

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

### Usage

Convert the PyTorch model to GGUF files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we provide):

```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Build for Linux or Mac:

```bash
make
make llama-minicpmv-cli
```

Inference on Linux or Mac:
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

### Android

#### Build on Android device using Termux
We found that building on the Android device itself gives better runtime performance, so we recommend building on-device.

[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).

Install the required tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```

It's recommended to move your model into the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```

#### Building the project using the Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.

Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux:

```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```

Install [Termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice).

Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executables to the `/data/data/com.termux/files/home/bin` path, then run the following commands in Termux to make them executable.

(This assumes you have pushed the built executables to the /sdcard/llama.cpp/bin path using `adb push`.)
```
$ cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$ cd /data/data/com.termux/files/home/bin
$ chmod +x ./*
```

Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`:

```
$ mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$ mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```

Now, you can start chatting:
```
$ cd /data/data/com.termux/files/home/bin
$ ./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```
