Commit eab776e

re-format and delete unused implementations

1 parent 95f49d9 commit eab776e

5 files changed: +7 -147 lines changed


convert_hf_to_gguf.py
Lines changed: 2 additions & 0 deletions

@@ -3935,6 +3935,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return super().modify_tensors(data_torch, name, bid)
 
+
 @ModelBase.register("ModernBert", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
 class ModernBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.MODERN_BERT
@@ -3958,6 +3959,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return super().modify_tensors(data_torch, name, bid)
 
+
 @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT

gguf-py/gguf/gguf_writer.py
Lines changed: 1 addition & 1 deletion

@@ -812,7 +812,7 @@ def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
 
     def add_rope_freq_base(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
-        
+
     def add_rope_freq_base_swa(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value)
 

src/llama-graph.cpp
Lines changed: 0 additions & 105 deletions

@@ -363,111 +363,6 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_input_attn_no_cache_iswa::set_input(const llama_ubatch * ubatch) {
-    // Standard attention mask
-    if (kq_mask) {
-        if (cparams.causal_attn) {
-            const int64_t n_kv         = ubatch->n_tokens;
-            const int64_t n_tokens     = ubatch->n_tokens;
-            const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-            const int64_t n_seqs       = ubatch->n_seqs;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-            float * data = (float *) kq_mask->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = ubatch->seq_id[s1][0];
-
-                    for (int j = 0; j < n_seq_tokens; ++j) {
-                        const int32_t tj = s1*n_seq_tokens + j;
-
-                        for (int s0 = 0; s0 < n_seqs; ++s0) {
-                            for (int i = 0; i < n_seq_tokens; ++i) {
-                                const int32_t ti = s0 * n_seq_tokens + i;
-                                float f = -INFINITY;
-
-                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
-                                    if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
-                                        if (hparams.use_alibi) {
-                                            f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
-                                        } else {
-                                            f = 0.0f;
-                                        }
-                                        break;
-                                    }
-                                }
-
-                                data[h * (n_kv * n_tokens) + tj * n_kv + ti] = f;
-                            }
-                        }
-                    }
-                }
-
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
-    }
-
-    // SWA attention mask
-    if (kq_mask_swa) {
-        if (cparams.causal_attn) {
-            const int64_t n_kv         = ubatch->n_tokens;
-            const int64_t n_tokens     = ubatch->n_tokens;
-            const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-            const int64_t n_seqs       = ubatch->n_seqs;
-            const int64_t window_size  = hparams.n_swa;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask_swa->buffer));
-            float * data = (float *) kq_mask_swa->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = ubatch->seq_id[s1][0];
-
-                    for (int j = 0; j < n_seq_tokens; ++j) {
-                        const int32_t tj = s1*n_seq_tokens + j;
-
-                        for (int s0 = 0; s0 < n_seqs; ++s0) {
-                            for (int i = 0; i < n_seq_tokens; ++i) {
-                                const int32_t ti = s0 * n_seq_tokens + i;
-                                float f = -INFINITY;
-
-                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
-                                    if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
-                                        const bool in_window = (ubatch->pos[tj] - ubatch->pos[ti]) <= window_size;
-
-                                        if (in_window) {
-                                            if (hparams.use_alibi) {
-                                                f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
-                                            } else {
-                                                f = 0.0f;
-                                            }
-                                        }
-                                        break;
-                                    }
-                                }
-
-                                data[h * (n_kv * n_tokens) + tj * n_kv + ti] = f;
-                            }
-                        }
-                    }
-                }
-
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
-    }
-}
-
 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
         kv_state->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
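
Note: the deleted llm_graph_input_attn_no_cache_iswa::set_input filled two additive F32 masks: a plain causal mask and a sliding-window (SWA) variant that additionally requires a key token to lie within hparams.n_swa positions of the query. Below is a minimal NumPy sketch of that masking rule; swa_causal_mask is a hypothetical helper for illustration only, covering a single sequence and skipping the ALiBi bias and the GGML_KQ_MASK_PAD padding rows handled by the C++ code.

import numpy as np

def swa_causal_mask(pos: np.ndarray, window_size: int) -> np.ndarray:
    """Additive attention mask for one sequence.

    mask[j, i] == 0.0   -> query token j may attend to key token i
    mask[j, i] == -inf  -> attention is blocked

    A key i is visible from query j only if it is not in the future
    (pos[i] <= pos[j]) and lies inside the sliding window
    (pos[j] - pos[i] <= window_size), mirroring the deleted loop above.
    """
    n = len(pos)
    mask = np.full((n, n), -np.inf, dtype=np.float32)
    for j in range(n):        # query position
        for i in range(n):    # key/value position
            if pos[i] <= pos[j] and (pos[j] - pos[i]) <= window_size:
                mask[j, i] = 0.0
    return mask

# Example: 6 tokens, window of 2 -> each token sees itself and at most
# the two previous tokens.
print(swa_causal_mask(np.arange(6), window_size=2))

Entries left at -inf become zero attention weight after the softmax, which is how both the causal and the window constraint are enforced.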

src/llama-graph.h
Lines changed: 0 additions & 37 deletions

@@ -243,28 +243,6 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     const llama_cparams & cparams;
 };
 
-class llm_graph_input_attn_no_cache_iswa : public llm_graph_input_i {
-public:
-    llm_graph_input_attn_no_cache_iswa(const llama_hparams & hparams, const llama_cparams & cparams) :
-        hparams(hparams),
-        cparams(cparams) {
-    }
-    ~llm_graph_input_attn_no_cache_iswa() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * get_kq_mask()     const { return kq_mask_cnv; }
-    ggml_tensor * get_kq_mask_swa() const { return kq_mask_swa_cnv; }
-
-    ggml_tensor * kq_mask         = nullptr; // F32 [n_tokens, n_batch]
-    ggml_tensor * kq_mask_cnv     = nullptr; //     [n_tokens, n_batch]
-    ggml_tensor * kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch]
-    ggml_tensor * kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch]
-
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
-};
-
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
 public:
     llm_graph_input_attn_kv_unified(
@@ -565,8 +543,6 @@ struct llm_graph_context {
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
-    llm_graph_input_attn_no_cache_iswa * build_attn_inp_no_cache_iswa() const;
-
     ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
            ggml_cgraph * gf,
@@ -580,19 +556,6 @@ struct llm_graph_context {
            float kq_scale,
            int il) const;
 
-    ggml_tensor * build_attn(
-           llm_graph_input_attn_no_cache_iswa * inp,
-           ggml_cgraph * gf,
-           ggml_tensor * wo,
-           ggml_tensor * wo_b,
-           ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-           ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-           ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-           ggml_tensor * kq_b,
-           ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-           float kq_scale,
-           int il) const;
-
     llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
 
     ggml_tensor * build_attn(

src/llama-model.cpp
Lines changed: 4 additions & 4 deletions

@@ -6203,25 +6203,25 @@ struct llm_build_modern_bert : public llm_graph_context {
        // feed-forward network
        ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur);
        cb(ffn_up, "ffn_up", il);
-        
+
        int64_t split_point = ffn_up->ne[0] / 2;
        ggml_tensor * output_ffn_up = ggml_cont(ctx0, ggml_view_2d(
            ctx0, ffn_up, split_point,
            ffn_up->ne[1], ffn_up->nb[1], 0
        ));
        ggml_tensor * output_ffn_gate = ggml_cont(ctx0, ggml_view_2d(
            ctx0, ffn_up, split_point,
-           ffn_up->ne[1], ffn_up->nb[1], 
+           ffn_up->ne[1], ffn_up->nb[1],
            split_point * ggml_element_size(ffn_up)
        ));
 
        // Apply activation function
        output_ffn_up = ggml_gelu(ctx0, output_ffn_up);
-        
+
        // Element-wise multiplication
        ggml_tensor * gated = ggml_mul(ctx0, output_ffn_up, output_ffn_gate);
        cb(gated, "ffn_gated", il);
-        
+
        // Final projection
        cur = build_lora_mm(model.layers[il].ffn_down, gated);
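
Note: the re-indented block above is the gated feed-forward used by llm_build_modern_bert: ffn_up projects to twice the FFN width, the result is split at split_point, GELU is applied to the first half, and the activated half is multiplied element-wise by the second half before ffn_down projects back. A rough NumPy sketch of the same dataflow follows; the function and weight names are hypothetical, the weights are plain matrices rather than ggml tensors, and gelu here uses a common tanh approximation, which may differ slightly in numerics from ggml_gelu.

import numpy as np

def gelu(x: np.ndarray) -> np.ndarray:
    # tanh approximation of GELU
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

def modern_bert_ffn(cur: np.ndarray, w_up: np.ndarray, w_down: np.ndarray) -> np.ndarray:
    """Gated FFN mirroring the diff above.

    cur    : [n_tokens, n_embd]
    w_up   : [n_embd, 2 * n_ff]  -- fused projection producing [up | gate]
    w_down : [n_ff, n_embd]
    """
    ffn_up = cur @ w_up                   # [n_tokens, 2 * n_ff]
    split_point = ffn_up.shape[-1] // 2
    up   = ffn_up[:, :split_point]        # first half  -> activated
    gate = ffn_up[:, split_point:]        # second half -> gate
    gated = gelu(up) * gate               # element-wise gating
    return gated @ w_down                 # final projection back to n_embd

# Example shapes: 4 tokens, n_embd = 8, n_ff = 16
rng = np.random.default_rng(0)
x = rng.standard_normal((4, 8))
print(modern_bert_ffn(x, rng.standard_normal((8, 32)), rng.standard_normal((16, 8))).shape)  # (4, 8)

Fusing the up and gate projections into one ffn_up matmul and splitting the output is equivalent to a GeGLU with separate weights, but needs only a single matrix multiplication.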
