Commit f57018f

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/workflows/build-linux-cross.yml
2 parents: afca31b + 086cf81

File tree: 6 files changed (+68, -59 lines)

convert_hf_to_gguf_update.py

Lines changed: 7 additions & 11 deletions
@@ -7,7 +7,6 @@
 import re
 
 import requests
-import sys
 import json
 import shutil
 import argparse
@@ -69,8 +68,7 @@ class TOKENIZER_TYPE(IntEnum):
 hf_token = args.hf_token if args.hf_token is not None else hf_token
 
 if hf_token is None:
-    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
-    sys.exit(1)
+    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
@@ -151,7 +149,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 
 def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
+    headers = {"Authorization": f"Bearer {token}"} if token else None
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -250,20 +248,18 @@ def get_existing_models(convert_py):
     else:
         # otherwise, compute the hash of the tokenizer
 
-        # Skip if the tokenizer folder does not exist or there are other download issues previously
-        if not os.path.exists(f"models/tokenizers/{name}"):
-            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-            continue
+        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
+        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
 
         try:
             logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
             if name == "t5":
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
             else:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-        except OSError as e:
-            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-            continue # Skip to the next model if the tokenizer can't be loaded
+        except Exception as e:
+            raise OSError(f"Error loading tokenizer for model {name}.") from e
 
         chktok = tokenizer.encode(CHK_TXT)
         chkhsh = sha256(str(chktok).encode()).hexdigest()

src/llama-batch.cpp

Lines changed: 2 additions & 0 deletions
@@ -157,6 +157,8 @@ bool llama_batch_allocr::init(
         n_outputs += batch.logits[i] != 0;
     }
 
+    has_cpl = false;
+
    // determine coupled sequences
    // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
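
Note: has_cpl is now reset at the start of init(), presumably because the allocator object is reused across batches and a stale value from a previous batch would otherwise persist. Below is a minimal standalone C++ sketch of the coupling check described by the comments above; toy_batch and batch_has_coupled_seqs are illustrative names, not llama.cpp API.

    #include <cstdint>
    #include <vector>

    // Sketch: a pair of sequences is "coupled" when at least one token in the
    // batch is assigned to both of them. toy_batch stands in for llama_batch.
    struct toy_batch {
        std::vector<std::vector<int32_t>> seq_ids; // seq_ids[i]: sequences of token i
    };

    static bool batch_has_coupled_seqs(const toy_batch & batch) {
        bool has_cpl = false; // reset per batch, mirroring the fix above
        for (const auto & ids : batch.seq_ids) {
            has_cpl = has_cpl || ids.size() > 1;
        }
        return has_cpl;
    }

    int main() {
        toy_batch b;
        b.seq_ids = {{0}, {0, 1}, {1}}; // token 1 belongs to sequences 0 and 1
        return batch_has_coupled_seqs(b) ? 0 : 1;
    }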

src/llama-batch.h

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ class llama_batch_allocr {
     using seq_cpl_t = std::vector<bool>;
 
     // helper flag to quickly determine if there are any coupled sequences in the batch
-    bool has_cpl;
+    bool has_cpl = false;
 
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1

src/llama-kv-cache-unified.cpp

Lines changed: 18 additions & 29 deletions
@@ -1283,6 +1283,8 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
     const int64_t n_tps = n_tokens/n_stream;
     const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
 
+    std::fill(data, data + ggml_nelements(dst), -INFINITY);
+
     // Use only the previous KV cells of the correct sequence for each token of the ubatch.
     // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
     // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
@@ -1306,44 +1308,31 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
 
            const llama_pos p1 = ubatch->pos[i];
 
-            for (uint32_t j = 0; j < n_kv; ++j) {
-                float f = 0.0f;
-
-                bool masked = false;
+            const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
 
+            for (uint32_t j = 0; j < n_kv; ++j) {
                if (cells.is_empty(j)) {
-                    masked = true;
-                } else {
-                    const llama_pos p0 = cells.pos_get(j);
-
-                    // mask the token if not the same sequence
-                    masked = masked || (!cells.seq_has(j, seq_id));
+                    continue;
+                }
 
-                    // mask future tokens
-                    masked = masked || (causal_attn && p0 > p1);
+                // mask the token if not the same sequence
+                if (!cells.seq_has(j, seq_id)) {
+                    continue;
+                }
 
-                    // apply SWA if any
-                    masked = masked || (is_masked_swa(p0, p1));
+                const llama_pos p0 = cells.pos_get(j);
 
-                    if (!masked && hparams.use_alibi) {
-                        f = -std::abs(p0 - p1);
-                    }
+                // mask future tokens
+                if (causal_attn && p0 > p1) {
+                    continue;
                }
 
-                if (masked) {
-                    f = -INFINITY;
+                // apply SWA if any
+                if (is_masked_swa(p0, p1)) {
+                    continue;
                }
 
-                data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = f;
-            }
-
-            // mask padded tokens
-            if (data) {
-                for (uint32_t ii = n_tps; ii < n_tps_pad; ++ii) {
-                    for (uint32_t j = 0; j < n_kv; ++j) {
-                        data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = -INFINITY;
-                    }
-                }
+                data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
            }
        }
    }
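
Note: the rewrite inverts the masking logic. Instead of computing a per-cell masked flag and separately overwriting padded rows, the whole mask is pre-filled with -INFINITY and only the entries that pass every check are written at data[idst + j]. The following is a standalone C++ sketch of that fill-then-write pattern on a flat buffer; the sizes and the causal-only check are illustrative, while the real code also checks cell emptiness, sequence membership, and SWA.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Sketch of "fill with -INFINITY, then write only unmasked cells".
    int main() {
        const int64_t n_kv = 8, n_tps = 3, n_tps_pad = 4, n_stream = 1, n_h = 1;
        const bool use_alibi = false;

        std::vector<float> data(n_h*n_stream*n_tps_pad*n_kv);
        std::fill(data.begin(), data.end(), -INFINITY); // padded rows stay masked

        for (int64_t h = 0; h < n_h; ++h) {
            for (int64_t s = 0; s < n_stream; ++s) {
                for (int64_t ii = 0; ii < n_tps; ++ii) {
                    const int64_t  p1   = ii; // position of the batch token
                    const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);

                    for (int64_t j = 0; j < n_kv; ++j) {
                        const int64_t p0 = j; // position of the cached cell
                        if (p0 > p1) {
                            continue; // future token: stays -INFINITY
                        }
                        data[idst + j] = use_alibi ? -std::abs(float(p0 - p1)) : 0.0f;
                    }
                }
            }
        }
        return 0;
    }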

src/llama-memory-hybrid.cpp

Lines changed: 1 addition & 1 deletion
@@ -38,9 +38,9 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
+        1,
         kv_size,
         n_seq_max,
-        1,
         n_pad,
         n_swa,
         swa_type

src/llama-model.cpp

Lines changed: 39 additions & 17 deletions
@@ -16654,46 +16654,68 @@ struct llm_build_lfm2 : public llm_graph_context {
             ggml_tensor * cur,
             llm_graph_input_rs * inp_recr,
             int il) {
-        const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+        const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+        const uint32_t kv_head = mctx_cur->get_head();
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs = ubatch.n_seqs;
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+        GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
+        const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
+
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
 
         auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
         cb(bcx, "model.layers.{}.conv.in_proj", il);
 
         constexpr auto n_chunks = 3;
         GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
         auto const chunk_size = bcx->ne[0] / n_chunks;
-        auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
-        auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
-        auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
+        auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
+        auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
+        auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
 
         auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
 
-        // read conv state directly, with build_rs generation is slower
-        ggml_tensor * conv_state = mctx_cur->get_r_l(il);
-        const int64_t n_seqs = ubatch.n_seqs;
-        ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
-        conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
+        // read conv state
+        auto * conv_state = mctx_cur->get_r_l(il);
+        auto * conv_rs = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
+        auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
 
         bx = ggml_concat(ctx0, conv, bx, 0);
         GGML_ASSERT(bx->ne[0] > conv->ne[0]);
 
-        auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+        // last d_conv columns is a new conv state
+        auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
         GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
 
-        // write conv state
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
+        // write new conv conv state
+        ggml_build_forward_expand(
+            gf,
+            ggml_cpy(
+                ctx0,
+                new_conv,
+                ggml_view_1d(
+                    ctx0,
+                    conv_state,
+                    ggml_nelements(new_conv),
+                    kv_head*d_conv*n_embd*ggml_element_size(new_conv)
+                )
+            )
+        );
 
         auto * conv_kernel = model.layers[il].shortconv.conv;
-        GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
-
-        // construct ssm_conv op
-        ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+        auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
         cb(conv_out, "model.layers.{}.conv.conv", il);
 
         auto * y = ggml_mul(ctx0, c, conv_out);
-
         y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
         cb(y, "model.layers.{}.conv.out_proj", il);
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
 
         return y;
     }
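
Note: besides reshaping the activations to {n_embd, n_seq_tokens, n_seqs} and switching the b/c/x views to 3D, the main behavioural change is the conv-state write-back: the new state is copied into the recurrent cache at slot kv_head (via ggml_view_1d with byte offset kv_head*d_conv*n_embd*ggml_element_size(new_conv)) instead of copying new_conv over conv_state from its start. A standalone C++ sketch of that offset arithmetic on a flat buffer, with illustrative sizes and std::copy standing in for ggml_cpy:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Sketch: the updated conv state for this ubatch lands at cache slot kv_head,
    // i.e. at an element offset of kv_head*d_conv*n_embd into conv_state.
    int main() {
        const uint32_t d_conv  = 2; // hparams.n_shortconv_l_cache - 1
        const uint32_t n_embd  = 4;
        const uint32_t n_slots = 8; // illustrative number of cache slots
        const uint32_t kv_head = 3; // first slot assigned to this ubatch
        const uint32_t n_seqs  = 2;

        std::vector<float> conv_state(size_t(n_slots)*d_conv*n_embd, 0.0f);
        std::vector<float> new_conv(size_t(n_seqs)*d_conv*n_embd, 1.0f);

        const size_t offset = size_t(kv_head)*d_conv*n_embd; // element offset
        std::copy(new_conv.begin(), new_conv.end(), conv_state.begin() + offset);

        return 0;
    }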
