Merged
Changes from 2 commits
27 changes: 0 additions & 27 deletions convert_hf_to_gguf.py
@@ -4798,25 +4798,6 @@ def prepare_tensors(self):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4832,14 +4813,6 @@ def set_vocab(self):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-

 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):
1 change: 1 addition & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -334,6 +334,7 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
"model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
"model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", # exaone
9 changes: 4 additions & 5 deletions src/llama-graph.cpp
@@ -650,8 +650,8 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
                 int64_t split_point = cur->ne[0] / 2;
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0);
+                ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur));

                 x0 = ggml_silu(ctx0, x0);
                 cb(cur, "ffn_silu", il);
@@ -663,9 +663,8 @@
             {
                 // Split into two equal parts
                 int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                ggml_tensor * x0 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0);
+                ggml_tensor * x1 = ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur));
Review comment (Member):
Removing the conts works because the ops will be offloaded back to the CPU if the tensor is not contiguous:

    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_EXP:
                    return ggml_is_contiguous(op->src[0]);
                default:
                    return false;
            }
            break;

For now let's keep the conts and expand the TODO to reference this comment, so that we remember to update the backend implementations to support non-contiguous inputs for these ops.

Reply (Collaborator Author):

@ggerganov Just to make sure I'm on the right track here: I started looking into this yesterday, and rather than (or maybe in addition to) making these ops handle non-contiguous data, it seems much simpler (and more efficient) to add dedicated GEGLU/SWIGLU ops.

Reply (Member):

Yes, good point.
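For illustration, here is a rough sketch of what such a GEGLU building block would compute, composed from existing ggml ops (ggml_view_2d, ggml_cont, ggml_gelu, ggml_mul). The helper name build_geglu_split is hypothetical, not an existing ggml or llama.cpp API; a true fused GGML_OP_GEGLU kernel would do the split, activation and multiply in a single pass and avoid the extra ggml_cont copies altogether.

#include "ggml.h"

// Hypothetical sketch only: GEGLU over a fused up-projection, where the first
// half of dim 0 is the part that gets activated and the second half is the gate.
static ggml_tensor * build_geglu_split(ggml_context * ctx, ggml_tensor * cur) {
    const int64_t split_point = cur->ne[0] / 2;

    // first half: x0, receives the GELU activation
    ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
    // second half: x1, the linear gate
    ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1],
                                                   split_point * ggml_element_size(cur)));

    // GEGLU(x) = GELU(x0) * x1
    return ggml_mul(ctx, ggml_gelu(ctx, x0), x1);
}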


                 x0 = ggml_gelu(ctx0, x0);
                 cb(x0, "ffn_gelu", il);
6 changes: 3 additions & 3 deletions src/llama-model.cpp
@@ -2224,8 +2224,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);

                     layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -6043,7 +6043,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
-                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,