
Commit 46325c7

Merge branch 'master' into feature/sparsek-attn-sycl
2 parents: 5d6d3b7 + 1c1409e

12 files changed (+155, -52 lines)

common/arg.cpp

Lines changed: 1 addition & 1 deletion

@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
-        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
         [](common_params & params, const std::string & value) {
             params.embd_out = value;
         }

common/json-schema-to-grammar.cpp

Lines changed: 19 additions & 3 deletions

@@ -601,7 +601,10 @@ class SchemaConverter {
    }

    std::string _resolve_ref(const std::string & ref) {
-        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
            _refs_being_resolved.insert(ref);
            json resolved = _refs[ref];

@@ -774,11 +777,24 @@ class SchemaConverter {
            std::vector<std::string> tokens = string_split(pointer, "/");
            for (size_t i = 1; i < tokens.size(); ++i) {
                std::string sel = tokens[i];
-                if (target.is_null() || !target.contains(sel)) {
+                if (target.is_object() && target.contains(sel)) {
+                    target = target[sel];
+                } else if (target.is_array()) {
+                    size_t sel_index;
+                    try {
+                        sel_index = std::stoul(sel);
+                    } catch (const std::invalid_argument & e) {
+                        sel_index = target.size();
+                    }
+                    if (sel_index >= target.size()) {
+                        _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                        return;
+                    }
+                    target = target[sel_index];
+                } else {
                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                    return;
                }
-                target = target[sel];
            }
            _refs[ref] = target;
        }
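For context, a small standalone sketch of the rule naming this hunk introduces: the rule name is now derived from the whole fragment after '#' rather than just the last path segment, and array indices inside the JSON pointer can be followed by the new array branch above. The helper ref_to_rule_name below is made up for illustration only; it assumes nothing beyond the regex shown in the diff.

    #include <iostream>
    #include <regex>
    #include <string>

    // Mirrors the ref_name construction from the hunk above: keep the part after '#'
    // and replace runs of characters outside [a-zA-Z0-9-] with '-'.
    static std::string ref_to_rule_name(const std::string & ref) {
        auto it = ref.find('#');
        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
        return "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
    }

    int main() {
        std::cout << ref_to_rule_name("#/definitions/foo") << "\n"; // ref-definitions-foo
        std::cout << ref_to_rule_name("#/items/0")         << "\n"; // ref-items-0 (array index)
        return 0;
    }

The same scheme is mirrored on the Python side in examples/json_schema_to_grammar.py below.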

examples/embedding/README.md

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ The above command will output space-separated float values.
 |         | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
 | 'json'  | openai style |
 | 'json+' | add cosine similarity matrix |
+| 'raw'   | plain text output |

 ### --embd-separator $"string"$
 | $"string"$ | |

examples/embedding/embedding.cpp

Lines changed: 25 additions & 0 deletions

@@ -70,6 +70,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     }
 }

+// plain, pipe-friendly output: one embedding per line
+static void print_raw_embeddings(const float * emb,
+                                 int n_embd_count,
+                                 int n_embd,
+                                 const llama_model * model,
+                                 enum llama_pooling_type pooling_type,
+                                 int embd_normalize) {
+    const uint32_t n_cls_out = llama_model_n_cls_out(model);
+    const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
+    const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;
+
+    for (int j = 0; j < n_embd_count; ++j) {
+        for (int i = 0; i < cols; ++i) {
+            if (embd_normalize == 0) {
+                LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            } else {
+                LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            }
+        }
+        LOG("\n");
+    }
+}
+
 int main(int argc, char ** argv) {
     common_params params;

@@ -372,6 +395,8 @@ int main(int argc, char ** argv) {
         }

         if (notArray) LOG("\n}\n");
+    } else if (params.embd_out == "raw") {
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
     }

     LOG("\n");
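A possible invocation of the new format, assuming the llama-embedding example binary built from this tree:

    llama-embedding -m model.gguf -p "some text" --embd-output-format raw

which prints one whitespace-delimited embedding per line, suitable for piping into other tools.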

examples/json_schema_to_grammar.py

Lines changed: 13 additions & 3 deletions

@@ -371,8 +371,17 @@ def visit(n: dict):
                    raise ValueError(f'Unsupported ref {ref}')

                for sel in ref.split('#')[-1].split('/')[1:]:
-                    assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
-                    target = target[sel]
+                    assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
+                    if isinstance(target, list):
+                        try:
+                            sel_index = int(sel)
+                        except ValueError:
+                            raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
+                        assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
+                        target = target[sel_index]
+                    else:
+                        assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
+                        target = target[sel]

                self._refs[ref] = target
            else:

@@ -547,7 +556,8 @@ def join_seq():


    def _resolve_ref(self, ref):
-        ref_name = ref.split('/')[-1]
+        ref_fragment = ref.split('#')[-1]
+        ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
        if ref_name not in self._rules and ref not in self._refs_being_resolved:
            self._refs_being_resolved.add(ref)
            resolved = self._refs[ref]

src/llama-kv-cache.cpp

Lines changed: 18 additions & 14 deletions

@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <cstring>
 #include <limits>
 #include <map>
 #include <stdexcept>

@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache(

     const uint32_t n_layer_kv = hparams.n_layer_kv();

+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {

@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache(
                return nullptr;
            }

-            ctx_map[buft] = ctx;
-            ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);

            return ctx;
        }

-        return it->second;
+        return it->second.get();
    };

    GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);

@@ -167,19 +174,16 @@ llama_kv_cache::llama_kv_cache(
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
-    for (auto it : ctx_map) {
-        auto * buft = it.first;
-        auto * ctx  = it.second;
-
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+    for (auto & [buft, ctx] : ctx_map) {
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
        if (!buf) {
            throw std::runtime_error("failed to allocate buffer for kv cache");
        }

        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

        ggml_backend_buffer_clear(buf, 0);
-        bufs.emplace_back(buf);
+        ctxs_bufs.emplace_back(std::move(ctx), buf);
    }

    {

@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) {
    }

    if (data) {
-        for (auto & buf : bufs) {
+        for (auto & [_, buf] : ctxs_bufs) {
            ggml_backend_buffer_clear(buf.get(), 0);
        }
    }

@@ -472,8 +476,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
    }
    return ret;
 }

@@ -1298,7 +1302,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
 size_t llama_kv_cache::total_size() const {
    size_t size = 0;

-    for (const auto & buf : bufs) {
+    for (const auto & [_, buf] : ctxs_bufs) {
        size += ggml_backend_buffer_get_size(buf.get());
    }

src/llama-kv-cache.h

Lines changed: 2 additions & 2 deletions

@@ -217,8 +217,8 @@ class llama_kv_cache : public llama_memory_i {
    // this is the SWA type of the cache - not to be confused with the model SWA type
    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method

src/llama-memory-recurrent.cpp

Lines changed: 18 additions & 14 deletions

@@ -7,6 +7,7 @@

 #include <algorithm>
 #include <cassert>
+#include <cstring>
 #include <limits>
 #include <map>
 #include <stdexcept>

@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
    cells.clear();
    cells.resize(mem_size);

+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
    // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {

@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
                return nullptr;
            }

-            ctx_map[buft] = ctx;
-            ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);

            return ctx;
        }

-        return it->second;
+        return it->second.get();
    };

    r_l.resize(n_layer);

@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
-    for (auto it : ctx_map) {
-        auto * buft = it.first;
-        auto * ctx  = it.second;
-
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+    for (auto & [buft, ctx] : ctx_map) {
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
        if (!buf) {
            throw std::runtime_error("failed to allocate buffer for rs cache");
        }
        ggml_backend_buffer_clear(buf, 0);
        LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        bufs.emplace_back(buf);
+        ctxs_bufs.emplace_back(std::move(ctx), buf);
    }

    {

@@ -129,7 +133,7 @@ void llama_memory_recurrent::clear(bool data) {
    used = 0;

    if (data) {
-        for (auto & buf : bufs) {
+        for (auto & [_, buf] : ctxs_bufs) {
            ggml_backend_buffer_clear(buf.get(), 0);
        }
    }

@@ -364,8 +368,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
    }
    return ret;
 }

@@ -662,7 +666,7 @@ bool llama_memory_recurrent::get_can_shift() const {

 size_t llama_memory_recurrent::total_size() const {
    size_t size = 0;
-    for (const auto & buf : bufs) {
+    for (const auto & [_, buf] : ctxs_bufs) {
        size += ggml_backend_buffer_get_size(buf.get());
    }

src/llama-memory-recurrent.h

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ class llama_memory_recurrent : public llama_memory_i {

    const uint32_t n_seq_max = 1;

-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

    size_t total_size() const;

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion

@@ -2231,7 +2231,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
    struct ggml_backend_buft_comparator {
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
-            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
        }
    };
    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
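For context, a minimal standalone sketch (not from the commit; buft_name and name_less are hypothetical stand-ins for ggml_backend_buft_name and the comparator) of why the comparison switches to strcmp: ordering two const char * values with < compares pointer addresses, which can vary between runs, while strcmp orders by the name text itself and keeps the map iteration order deterministic.

    #include <cstdio>
    #include <cstring>
    #include <map>

    // Hypothetical stand-in for ggml_backend_buft_name(): each backend exposes a C-string name.
    static const char * buft_name(int id) {
        static const char * names[] = {"CUDA0", "CPU", "Metal"};
        return names[id];
    }

    // Content-based comparator, mirroring the fixed ggml_backend_buft_comparator.
    struct name_less {
        bool operator()(const char * lhs, const char * rhs) const {
            return strcmp(lhs, rhs) < 0; // compare the names, not the pointer addresses
        }
    };

    int main() {
        std::map<const char *, int, name_less> ctx_map;
        for (int id = 0; id < 3; ++id) {
            ctx_map[buft_name(id)] = id;
        }
        // Iterates in a well-defined order by name: CPU, CUDA0, Metal.
        // With the default operator< on const char *, the order would depend on
        // where the strings happen to live in memory.
        for (const auto & [name, id] : ctx_map) {
            std::printf("%s -> %d\n", name, id);
        }
        return 0;
    }

The same comparator is used in the llama-kv-cache.cpp and llama-memory-recurrent.cpp hunks above.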
