Commit 0701c73 (1 parent: 0e2ae3a)

Sync abetlen/llama-cpp-python 0.3.11 code

8 files changed: +43 −17 lines

.github/workflows/build-and-release.yaml

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04, windows-2019, macos-13]
+        os: [ubuntu-22.04, windows-2022, macos-14, macos-15]

     steps:
       - uses: actions/checkout@v4

.github/workflows/build-wheels-cuda.yaml

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@ permissions:
 jobs:
   define_matrix:
     name: Define Build Matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     defaults:
@@ -20,7 +20,7 @@ jobs:
         id: set-matrix
         run: |
           $matrix = @{
-              'os' = @('ubuntu-latest', 'windows-2019')
+              'os' = @('ubuntu-22.04', 'windows-2022')
              'pyver' = @("3.9", "3.10", "3.11", "3.12")
              'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1")
              'releasetag' = @("basic")
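The matrix above expands to the full cross product of runner OS, Python version, and CUDA toolkit. A rough Python illustration of the resulting job count (not project code; names below are for exposition only):

    # Rough illustration (not project code): the CI build matrix is the
    # cross product of OS x Python version x CUDA toolkit.
    from itertools import product

    os_list = ["ubuntu-22.04", "windows-2022"]
    pyvers = ["3.9", "3.10", "3.11", "3.12"]
    cudas = ["12.1.1", "12.2.2", "12.3.2", "12.4.1"]

    jobs = list(product(os_list, pyvers, cudas))
    print(len(jobs))  # 2 * 4 * 4 = 32 wheel-build jobs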

.github/workflows/build-wheels-metal.yaml

Lines changed: 2 additions & 13 deletions

@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [macos-13, macos-14, macos-15]
+        os: [macos-14, macos-15]

     steps:
       - uses: actions/checkout@v4
@@ -25,30 +25,19 @@ jobs:
           cache: 'pip'

       - name: Install dependencies (Linux/MacOS)
-        if: runner.os != 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
         shell: bash

-      - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
-        env:
-          RUST_LOG: trace
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          python -m uv pip install -e .[all] --verbose
-        shell: cmd
-
       - name: Build wheels
         uses: pypa/[email protected]
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_ARCHS: "arm64"
-          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on"
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on -DCMAKE_CROSSCOMPILING=ON"
          CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*"
         with:
           package-dir: .

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## [Unreleased]

+## [0.3.11]
+
+- fix: Update reference to `llama_kv_cache_clear` in Llama.embed. Closes #2037 by @abetlen in 9e5a4eaa84156084ed7bbb91e6efcc91dc6217bc
+
 ## [0.3.10]

 - feat: Update llama.cpp to ggerganov/llama.cpp@28657a8229b5adc6028cf1c4ed62191792d2fdb0

docker/simple/Dockerfile

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ ARG IMAGE

 # Update and upgrade the existing packages
 RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    git \
     python3 \
     python3-pip \
     ninja-build \

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.3.10"
+__version__ = "0.3.11"
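After this bump, an installed copy of the package reports the new version; a quick sanity check from an interpreter:

    # Quick check after upgrading the installed package.
    import llama_cpp

    print(llama_cpp.__version__)  # expected: "0.3.11"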

llama_cpp/server/model.py

Lines changed: 14 additions & 0 deletions

@@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler(
             clip_model_path=settings.clip_model_path, verbose=settings.verbose
         )
+    elif settings.chat_format == "qwen2.5-vl":
+        assert settings.clip_model_path is not None, "clip model not found"
+        if settings.hf_model_repo_id is not None:
+            chat_handler = (
+                llama_cpp.llama_chat_format.Qwen25VLChatHandler.from_pretrained(
+                    repo_id=settings.hf_model_repo_id,
+                    filename=settings.clip_model_path,
+                    verbose=settings.verbose,
+                )
+            )
+        else:
+            chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler(
+                clip_model_path=settings.clip_model_path, verbose=settings.verbose
+            )
     elif settings.chat_format == "hf-autotokenizer":
         assert (
             settings.hf_pretrained_model_name_or_path is not None
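The new branch wires the `qwen2.5-vl` chat format into server model loading. A minimal sketch of driving it programmatically, assuming a local Qwen2.5-VL GGUF plus its CLIP/mmproj file (both file names below are hypothetical placeholders):

    # Minimal sketch, assuming local model files; the two paths are
    # placeholders, not files shipped with the project.
    from llama_cpp.server.model import load_llama_from_model_settings
    from llama_cpp.server.settings import ModelSettings

    settings = ModelSettings(
        model="./qwen2.5-vl-7b-instruct-q4_k_m.gguf",       # placeholder path
        chat_format="qwen2.5-vl",                           # selects Qwen25VLChatHandler
        clip_model_path="./mmproj-qwen2.5-vl-7b-f16.gguf",  # placeholder path
    )
    llama = load_llama_from_model_settings(settings)

When `hf_model_repo_id` is also set, the handler is instead built via `Qwen25VLChatHandler.from_pretrained`, which downloads the CLIP file from the Hugging Face repo rather than reading a local path.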

tests/test_llama.py

Lines changed: 18 additions & 0 deletions

@@ -123,6 +123,7 @@ def test_real_llama(llama_cpp_model_path):
         n_threads=multiprocessing.cpu_count(),
         n_threads_batch=multiprocessing.cpu_count(),
         flash_attn=True,
+        logits_all=False,
         swa_full = True
     )

@@ -216,3 +217,20 @@ def logit_processor_func(input_ids, logits):

     assert number_1 != number_2
     assert number_1 == number_3
+
+
+def test_real_llama_embeddings(llama_cpp_model_path):
+    model = llama_cpp.Llama(
+        llama_cpp_model_path,
+        n_ctx=32,
+        n_batch=32,
+        n_ubatch=32,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        flash_attn=True,
+        swa_full=True,
+        embedding=True
+    )
+    # Smoke test for now
+    model.embed("Hello World")
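This smoke test covers the `Llama.embed` path fixed in 0.3.11 (the `llama_kv_cache_clear` reference from the changelog entry above). Outside the test harness the same path corresponds to ordinary embedding usage; a minimal sketch with a placeholder model path:

    # Minimal sketch mirroring the new smoke test; "./model.gguf" is a
    # placeholder for any embedding-capable GGUF model.
    import llama_cpp

    model = llama_cpp.Llama("./model.gguf", embedding=True, n_ctx=32)
    vec = model.embed("Hello World")  # Llama.embed is the method fixed in 0.3.11
    print(len(vec))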
