
Commit 504ee68

Merge branch 'ggerganov:master' into master
2 parents: 9a60f1b + 8e6a9d2

11 files changed (+249 / -83 lines)


.github/workflows/build.yml

Lines changed: 41 additions & 0 deletions
@@ -184,6 +184,47 @@ jobs:
           cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
           cmake --build . --config Release -j $(nproc)

+  ubuntu-22-cmake-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
+          cmake --build . --config Release -j $(nproc)
+
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   # how to debug it.
   # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
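
For anyone reproducing the new FP16 SYCL build outside CI, the job's build step boils down to the following sequence (a sketch assuming Ubuntu 22.04 with the oneAPI DPC++ compiler and MKL installed as in the steps above):

```sh
# Load the oneAPI environment, then configure with SYCL and FP16 support enabled.
source /opt/intel/oneapi/setvars.sh
mkdir build && cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
```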

CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -850,7 +850,9 @@ endif()

 set(ARCH_FLAGS "")

-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+        CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
     message(STATUS "ARM detected")
     if (MSVC)
         add_compile_definitions(__ARM_NEON)
@@ -876,7 +878,9 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
     message(STATUS "x86 detected")
     if (MSVC)
         # instruction set detection for MSVC only
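
The practical effect of the new checks is that an explicitly requested target architecture now takes precedence over the host's CMAKE_SYSTEM_PROCESSOR. A sketch of exercising the ARM branch from an x86_64 macOS host (the build directory name is arbitrary):

```sh
# Cross-compile for Apple Silicon from an Intel Mac: CMAKE_OSX_ARCHITECTURES=arm64
# now selects the "ARM detected" path even though the host reports x86_64.
cmake -B build-arm64 -DCMAKE_OSX_ARCHITECTURES=arm64
cmake --build build-arm64 --config Release
```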

README.md

Lines changed: 2 additions & 1 deletion
@@ -150,6 +150,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [ollama/ollama](https://github.com/ollama/ollama)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [semperai/amica](https://github.com/semperai/amica)
@@ -679,7 +680,7 @@ python3 -m pip install -r requirements.txt
 python3 convert.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocabtype bpe
+python convert.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

common/sampling.cpp

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ static void sampler_queue(
     const float   temp              = params.temp;
     const float   dynatemp_range    = params.dynatemp_range;
     const float   dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
+    const int32_t top_k             = params.top_k;
     const float   top_p             = params.top_p;
     const float   min_p             = params.min_p;
     const float   tfs_z             = params.tfs_z;
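
The queue no longer maps `top_k <= 0` to `n_vocab` itself; presumably that special case is now handled inside the top-k sampler. From the CLI, the conventional way to disable top-k filtering should be unchanged (a sketch with a placeholder model path):

```sh
# --top-k 0 disables top-k filtering, so sampling considers the full vocabulary.
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -p "Building a website can be done in" --top-k 0
```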

convert-hf-to-gguf.py

Lines changed: 61 additions & 2 deletions
@@ -1078,17 +1078,76 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])

     def set_vocab(self):
         self._set_vocab_hf()

+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # HF models permute some of the tensors, so we need to undo that
+            if name.endswith(("q_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+

 class QwenModel(Model):
     @staticmethod
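
With the MiniCPM tensor handling above in place, conversion should follow the script's usual command-line flow; a sketch (the checkpoint directory is an assumed example, and `--outtype f16` corresponds to the `self.ftype == 1` branches above):

```sh
# Convert a locally downloaded MiniCPM Hugging Face checkpoint to GGUF at f16.
python convert-hf-to-gguf.py ./models/MiniCPM-2B-sft-bf16 --outtype f16
```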

examples/llava/README.md

Lines changed: 3 additions & 3 deletions
@@ -14,14 +14,14 @@ Build with cmake or run `make llava-cli` to build it.
 After building, run: `./llava-cli` to see the usage. For example:

 ```sh
-./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.

 ## Model conversion

-- Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally:
+- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally:

 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@@ -38,7 +38,7 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
