Commit c734b53

Merge branch 'master' into xsn/server_mtmd

2 parents 78a76de + 1d735c0

116 files changed (+11356, -7975 lines)


.github/workflows/build.yml

Lines changed: 6 additions & 4 deletions
@@ -601,8 +601,9 @@ jobs:
             -DGGML_SYCL_F16=ON
           cmake --build build --config Release -j $(nproc)
 
-  build-linux-cross:
-    uses: ./.github/workflows/build-linux-cross.yml
+  # Disabled for now due to sporadic issue syncing.
+  # build-linux-cross:
+  #   uses: ./.github/workflows/build-linux-cross.yml
 
   macOS-latest-cmake-ios:
     runs-on: macos-latest
@@ -1766,16 +1767,17 @@ jobs:
     if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
     defaults:
      run:
-        shell: bash -el {0}
-    runs-on: ubuntu-24.04-arm
+        shell: bash -el {0}
     strategy:
       matrix:
+        arch: [x86, aarch64]
         cann:
           - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
           - 'Release'
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
     container: ascendai/cann:${{ matrix.cann }}
     steps:
       - name: Checkout
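
The second hunk adds an `arch` dimension to the Ascend NPU job's matrix and selects the runner with the `&&`/`||` chain that GitHub Actions expressions use in place of a ternary operator (it works here because the middle operand, a non-empty string, is always truthy). As an illustration only, not part of the commit, the equivalent selection logic in C++:

```cpp
#include <iostream>
#include <string>

// Sketch of the selection performed by
//   ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
static std::string pick_runner(const std::string & arch) {
    return arch == "aarch64" ? "ubuntu-24.04-arm" : "ubuntu-24.04";
}

int main() {
    std::cout << pick_runner("aarch64") << "\n"; // ubuntu-24.04-arm
    std::cout << pick_runner("x86")     << "\n"; // ubuntu-24.04
}
```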

Makefile

Lines changed: 0 additions & 4 deletions
@@ -780,10 +780,6 @@ ifdef GGML_HIP
 
     MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
-    ifdef GGML_HIP_UMA
-        MK_CPPFLAGS += -DGGML_HIP_UMA
-    endif # GGML_HIP_UMA
-
     MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
     MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
     MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas

README.md

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
 - **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -2728,7 +2728,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
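
The changed line keeps the fluent style used throughout `common/arg.cpp`: an option is registered once, then tagged with the examples that accept it and an optional environment variable. A simplified, self-contained mock of that chaining pattern (not llama.cpp's actual `common_arg` class) is sketched below; each setter returns a reference to the object, so the calls compose:

```cpp
#include <iostream>
#include <set>
#include <string>

// Mock only: illustrates why .set_examples(...).set_env(...) can be chained.
enum llama_example_mock { EXAMPLE_MAIN, EXAMPLE_SERVER, EXAMPLE_LLAVA };

struct mock_arg {
    std::string                  flag;
    std::set<llama_example_mock> examples;
    std::string                  env;

    explicit mock_arg(std::string f) : flag(std::move(f)) {}

    mock_arg & set_examples(std::set<llama_example_mock> ex) { examples = std::move(ex); return *this; }
    mock_arg & set_env(std::string e)                        { env      = std::move(e);  return *this; }
};

int main() {
    // Mirrors the shape of the changed line: --chat-template is now visible to
    // the main, server and llava examples and can be set via LLAMA_ARG_CHAT_TEMPLATE.
    mock_arg arg = mock_arg("--chat-template")
        .set_examples({EXAMPLE_MAIN, EXAMPLE_SERVER, EXAMPLE_LLAVA})
        .set_env("LLAMA_ARG_CHAT_TEMPLATE");

    std::cout << arg.flag << " is accepted by " << arg.examples.size()
              << " examples and reads " << arg.env << "\n";
}
```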

convert_hf_to_gguf.py

Lines changed: 394 additions & 273 deletions
Large diffs are not rendered by default.

convert_lora_to_gguf.py

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@
 import gguf
 
 # reuse model definitions from convert_hf_to_gguf.py
-from convert_hf_to_gguf import LazyTorchTensor, Model
+from convert_hf_to_gguf import LazyTorchTensor, ModelBase
 
 logger = logging.getLogger("lora-to-gguf")
 
@@ -340,11 +340,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         sys.exit(1)
     else:
         logger.info(f"Loading base model: {dir_base_model.name}")
-        hparams = Model.load_hparams(dir_base_model)
+        hparams = ModelBase.load_hparams(dir_base_model)
 
     with torch.inference_mode():
         try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
+            model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
         except NotImplementedError:
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)

docs/build.md

Lines changed: 4 additions & 2 deletions
@@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
     cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
         && cmake --build build --config Release -- -j 16
     ```
-On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
 To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
 
@@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan
 
 **Windows**
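
Together with the Makefile change above, this moves the UMA switch from a compile-time flag (`GGML_HIP_UMA`) to a runtime environment variable. As a rough sketch of the general pattern only, assuming a getenv-based lookup rather than quoting ggml's actual code, such a toggle is typically read once and used to pick an allocation strategy:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Illustrative sketch, not ggml's implementation: check whether the user asked
// for unified (shared) memory via GGML_CUDA_ENABLE_UNIFIED_MEMORY=1.
static bool unified_memory_requested() {
    const char * env = std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY");
    return env != nullptr && std::strcmp(env, "1") == 0;
}

int main() {
    if (unified_memory_requested()) {
        std::puts("would allocate memory shared between CPU and integrated GPU");
    } else {
        std::puts("would allocate dedicated device memory");
    }
}
```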

examples/llava/CMakeLists.txt

Lines changed: 6 additions & 16 deletions
@@ -61,19 +61,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(mtmd BUILD_INFO)
 endif()
 
-set(TARGET llama-llava-cli)
-add_executable(${TARGET} llava-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-minicpmv-cli)
-add_executable(${TARGET} minicpmv-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-llava-cli deprecation-warning.cpp)
+add_executable(llama-gemma3-cli deprecation-warning.cpp)
+add_executable(llama-minicpmv-cli deprecation-warning.cpp)
 
 set(TARGET llama-qwen2vl-cli)
 add_executable(${TARGET} qwen2vl-cli.cpp)
@@ -82,9 +72,9 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET llama-gemma3-cli)
-add_executable(${TARGET} gemma3-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+set(TARGET llama-mtmd-cli)
+add_executable(${TARGET} mtmd-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
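
The retired `llama-llava-cli`, `llama-gemma3-cli` and `llama-minicpmv-cli` names are now built from a shared `deprecation-warning.cpp`, so running an old binary still tells the user where to go. The file's contents are not shown in this commit; a hypothetical sketch of such a stub:

```cpp
#include <cstdio>

// Hypothetical stub (the real deprecation-warning.cpp is not part of this
// listing): print a notice pointing at the replacement binary and exit.
int main(int argc, char ** argv) {
    const char * prog = argc > 0 ? argv[0] : "llama-llava-cli";
    std::fprintf(stderr,
        "WARNING: %s is deprecated.\n"
        "Please use llama-mtmd-cli instead (https://github.com/ggml-org/llama.cpp/pull/13012).\n",
        prog);
    return 1;
}
```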

examples/llava/clip-impl.h

Lines changed: 0 additions & 3 deletions
@@ -50,7 +50,6 @@
 // tensor name constants
 //
 
-#define TN_TOKEN_EMBD "%s.token_embd.weight"
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
@@ -66,8 +65,6 @@
 #define TN_LN_2 "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
-#define TN_TEXT_PROJ "text_projection.weight"
-#define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
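
The removed constants follow the same pattern as the ones that remain: printf-style format strings that are expanded into tensor names with a prefix (for example "v" for the vision tower, as in `TN_CLASS_EMBD`) and, where needed, a suffix such as "weight" or "bias". A small illustration of that expansion, assumed usage rather than code from the commit:

```cpp
#include <cstdio>

// Format strings copied from the diff above; the expansion shown here is an
// illustration of how such constants are typically used.
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_LN_PRE   "%s.pre_ln.%s"

int main() {
    char name[256];

    std::snprintf(name, sizeof(name), TN_POS_EMBD, "v");
    std::printf("%s\n", name); // v.position_embd.weight

    std::snprintf(name, sizeof(name), TN_LN_PRE, "v", "weight");
    std::printf("%s\n", name); // v.pre_ln.weight
}
```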
