
Commit 3060dfb

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	examples/model-conversion/Makefile
#	examples/model-conversion/scripts/causal/convert-model.sh
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cuda/CMakeLists.txt
#	scripts/compare-commits.sh
2 parents 0f2436b + 7380414 commit 3060dfb

28 files changed: +395 additions, −257 deletions

common/arg.cpp

Lines changed: 18 additions & 2 deletions
@@ -2557,15 +2557,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));

@@ -3540,6 +3540,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-30b-default"},
+        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),

common/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -996,7 +996,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }

+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;

+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };

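These two fields are populated in common.cpp above via llama_adapter_meta_val_str(), which copies GGUF string metadata into a caller-provided buffer. To check what a converted adapter file actually carries, the same keys can be dumped with gguf-py; a minimal sketch, with a hypothetical filename and the string-decoding idiom borrowed from gguf-py's gguf_dump script:

    from gguf import GGUFReader, GGUFValueType

    # hypothetical per-task adapter produced by the converter changes below
    reader = GGUFReader("lora-retrieval.query-jina-embeddings-v3-f16.gguf")

    for key in ("adapter.type", "adapter.lora.task_name", "adapter.lora.prompt_prefix"):
        field = reader.fields.get(key)
        if field is not None and field.types[0] == GGUFValueType.STRING:
            # for string fields the payload sits in the last part
            print(f"{key} = {bytes(field.parts[-1]).decode('utf-8')}")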

convert_hf_to_gguf.py

Lines changed: 77 additions & 2 deletions
@@ -72,6 +72,7 @@ class ModelBase:
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    dry_run: bool
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -111,6 +112,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager or (remote_hf_model_id is not None)
+        self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         if remote_hf_model_id is not None:
             self.is_safetensors = True
@@ -4871,11 +4873,35 @@ def modify_tensors(self, data_torch, name, bid):
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
+    _lora_files = {}
+    _lora_names = []

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model, False)
+
+        if lora_names := hparams.get("lora_adaptations"):
+            self._lora_names = lora_names
+            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
         self._xlmroberta_tokenizer_init()

+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if self._lora_names:
+            for name in self._lora_names:
+                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
+                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
+
+        return super().generate_extra_tensors()
+
+    def set_type(self):
+        for lora_writer in self._lora_files.values():
+            lora_writer.add_type(gguf.GGUFType.ADAPTER)
+            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+        super().set_type()
+
     def set_vocab(self):
         self._xlmroberta_set_vocab()

@@ -4885,13 +4911,62 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith("roberta."):
             name = name[8:]

+        # jina-embeddings-v3
+        if ".parametrizations." in name:
+            name = name.replace(".parametrizations.", ".")
+            if name.endswith(".original"):
+                name = name[:-9]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
                 data_torch = data_torch[self._position_offset:,:]

+        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
+            if name.startswith("pooler.dense"):
+                return []
+
+            num_loras = data_torch.size(0)
+            assert num_loras == len(self._lora_names)
+
+            # Split out each LoRA in their own GGUF
+            for i, lora_writer in enumerate(self._lora_files.values()):
+                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
+                data = data_torch[i, :, :]
+                # Transpose/flip token_embd/types into correct shape
+                if new_name == "token_embd.weight.lora_b":
+                    data = data.T
+                elif new_name.startswith("token_types.weight."):
+                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
+                lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
+
+            return []
+
         return super().modify_tensors(data_torch, name, bid)

+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # jina-embeddings-v3
+        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
+            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
+        lora_alpha = self.hparams.get("lora_alpha")
+        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
+            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
+        for lora_name, lora_writer in self._lora_files.items():
+            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
+            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
+            if lora_prompt_prefixes:
+                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
+
+    def write(self):
+        super().write()
+        for lora_writer in self._lora_files.values():
+            lora_writer.write_header_to_file()
+            lora_writer.write_kv_data_to_file()
+            lora_writer.write_tensors_to_file(progress=True)
+            lora_writer.close()
+

 @ModelBase.register("GemmaForCausalLM")
 class GemmaModel(TextModel):
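The notable part of this conversion: jina-embeddings-v3 ships all of its task adapters stacked along dim 0 of a single tensor per weight, and modify_tensors() slices one (lora_A, lora_B) pair per task into its own GGUF writer. A minimal sketch of that layout and split, with made-up shapes and illustrative task names (the model lists its actual names in the lora_adaptations entry of its config):

    import torch

    # illustrative: 5 stacked task adapters, rank 4, hidden size 16
    task_names = ["retrieval.query", "retrieval.passage", "separation", "classification", "text-matching"]
    num_loras, rank, hidden = len(task_names), 4, 16

    lora_A = torch.randn(num_loras, rank, hidden)   # stacked lora_A for one weight
    lora_B = torch.randn(num_loras, hidden, rank)   # stacked lora_B for the same weight
    assert lora_A.size(0) == len(task_names)        # mirrors the converter's num_loras check

    for i, name in enumerate(task_names):
        a, b = lora_A[i], lora_B[i]                 # one pair per task, each written to its own lora-<name>-*.gguf
        delta = b @ a                               # the LoRA update B·A, scaled by alpha at load time
        print(f"{name}: A{tuple(a.shape)} B{tuple(b.shape)} -> delta {tuple(delta.shape)}")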

ggml/src/ggml-cpu/ops.cpp

Lines changed: 11 additions & 10 deletions
@@ -9003,8 +9003,7 @@ static void ggml_compute_forward_ssm_scan_f32(
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
     GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
-    // allows optimizing the modulo since n_group should be a power of 2
-    GGML_ASSERT((ng & -ng) == ng);
+    GGML_ASSERT(nh % ng == 0);

     // heads per thread
     const int dh = (nh + nth - 1)/nth;
@@ -9035,6 +9034,7 @@ static void ggml_compute_forward_ssm_scan_f32(
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
             const float dA = expf(dt_soft_plus * A[h]);
+            const int g = h / (nh / ng); // repeat_interleave

             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9057,8 +9057,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // TODO: maybe unroll more?
                 for (int j = 0; j < 1; j++) {
                     GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
-                    GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
-                    GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+                    GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
+                    GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);

                     t0 = GGML_F32_VEC_MUL(t0, adA);
                     t1 = GGML_F32_VEC_MUL(t1, axdt);
@@ -9090,8 +9090,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                 for (int i = 0; i < np; i += GGML_F32_STEP) {
                     for (int j = 0; j < GGML_F32_ARR; j++) {
                         ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
-                        ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
-                        az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+                        ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
+                        az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);

                         ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
                         ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
@@ -9113,7 +9113,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // d_state
                 for (int i0 = np; i0 < nc; ++i0) {
                     const int i = i0 + ii*nc;
-                    const int ig = i0 + (h & (ng - 1))*nc;
+                    const int ig = i0 + g*nc;
                     // state = prev_state * dA + dB * x
                     const float state = (s0[i] * dA) + (B[ig] * x_dt);
                     // y = rowwise_dotprod(state, C)
@@ -9130,6 +9130,7 @@ static void ggml_compute_forward_ssm_scan_f32(
         for (int h = ih0; h < ih1; ++h) {
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+            const int g = h / (nh / ng); // repeat_interleave

             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9144,8 +9145,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // TODO: what happens when (d_state % svcntw()) != 0?
                 for (int64_t k = 0; k < nc; k += svcntw()) {
                     svfloat32_t vA  = GGML_F32_VEC_LOAD(&A[h*nc + k]);
-                    svfloat32_t vB  = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
-                    svfloat32_t vC  = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
+                    svfloat32_t vB  = GGML_F32_VEC_LOAD(&B[k + g*nc]);
+                    svfloat32_t vC  = GGML_F32_VEC_LOAD(&C[k + g*nc]);
                     svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);

                     svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
@@ -9165,7 +9166,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // d_state
                 for (int i0 = 0; i0 < nc; ++i0) {
                     const int i = i0 + ii*nc;
-                    const int ig = i0 + (h & (ng - 1))*nc;
+                    const int ig = i0 + g*nc;
                     // state = prev_state * dA + dB * x
                     const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
                     // y = rowwise_dotprod(state, C)
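The substantive change here (mirrored in the CUDA and Metal hunks below) is the head-to-group mapping for B and C. The old expression (h & (ng - 1)) equals h % ng and was guarded by a power-of-two assert; the new h / (nh / ng) only requires that ng divides nh and assigns consecutive blocks of nh/ng heads to the same group, matching the repeat_interleave expansion used by the reference Mamba code. A small sketch of the two mappings with hypothetical sizes:

    import torch

    nh, ng, d = 8, 2, 4                          # hypothetical head/group counts
    old = [h & (ng - 1) for h in range(nh)]      # cyclic:  [0, 1, 0, 1, 0, 1, 0, 1]
    new = [h // (nh // ng) for h in range(nh)]   # blocked: [0, 0, 0, 0, 1, 1, 1, 1]

    # the blocked mapping is exactly torch.repeat_interleave expanding per-group
    # B of shape (ng, d) to per-head shape (nh, d)
    B = torch.randn(ng, d)
    expanded = torch.repeat_interleave(B, nh // ng, dim=0)
    assert all(torch.equal(expanded[h], B[new[h]]) for h in range(nh))

    # with ng == 1 both mappings are identically zero, so the difference only
    # shows up for multi-group models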

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -3119,7 +3119,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         return false;
     }

-#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
+#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) || defined(GGML_USE_HIP)
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error

ggml/src/ggml-cuda/ssm-scan.cu

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ __global__ void __launch_bounds__(d_state, 1)
     const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float);
     const int seq_idx  = blockIdx.y;

-    const int group_off = (head_idx & (n_group - 1)) * d_state * sizeof(float);
+    const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);

     const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
     const float * x_block  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float));

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 6 additions & 4 deletions
@@ -1983,14 +1983,15 @@ kernel void kernel_ssm_scan_f32(
     device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
     device       float * s_buff  = (device       float *) ((device       char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
     const int64_t i = i0 + i1*nc;
+    const int64_t g = ir / (nh / ng); // repeat_interleave
     float s0 = s0_buff[i];
     float s  = s_buff[i];

     device const float * A        = (device const float *) ((device const char *) src3 + ir*args.nb31);
     device const float * x_block  = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
     device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
-    device const float * B_block  = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
-    device const float * C_block  = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
+    device const float * B_block  = (device const float *) ((device const char *) src4 + g*args.nb41 + i3*args.nb43);
+    device const float * C_block  = (device const float *) ((device const char *) src5 + g*args.nb51 + i3*args.nb53);
     device       float * y_block  = (device       float *) ((device       char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);

     for (int64_t i2 = 0; i2 < n_t; ++i2) {
@@ -2098,14 +2099,15 @@ kernel void kernel_ssm_scan_f32_group(
     device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
     device       float * s_buff  = (device       float *) ((device       char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
     const int64_t i = i0 + i1*nc;
+    const int64_t g = ir / (nh / ng); // repeat_interleave
     float s0 = s0_buff[i];
     float s  = s_buff[i];

     device const float * A        = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
     device const float * x_block  = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
     device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
-    device const float * B_block  = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
-    device const float * C_block  = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
+    device const float * B_block  = (device const float *) ((device const char *) src4 + g*args.nb41 + i3*args.nb43);
+    device const float * C_block  = (device const float *) ((device const char *) src5 + g*args.nb51 + i3*args.nb53);
     device       float * y_block  = (device       float *) ((device       char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);

     for (int64_t i2 = 0; i2 < n_t; ++i2) {

gguf-py/gguf/constants.py

Lines changed: 18 additions & 2 deletions
@@ -231,8 +231,10 @@ class Tokenizer:
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"

     class Adapter:
-        TYPE       = "adapter.type"
-        LORA_ALPHA = "adapter.lora.alpha"
+        TYPE               = "adapter.type"
+        LORA_ALPHA         = "adapter.lora.alpha"
+        LORA_TASK_NAME     = "adapter.lora.task_name"
+        LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"

     class IMatrix:
         CHUNK_COUNT = "imatrix.chunk_count"
@@ -315,6 +317,7 @@ class MODEL_ARCH(IntEnum):
     NOMIC_BERT_MOE   = auto()
     NEO_BERT         = auto()
     JINA_BERT_V2     = auto()
+    JINA_BERT_V3     = auto()
     BLOOM            = auto()
     STABLELM         = auto()
     QWEN             = auto()
@@ -647,6 +650,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
     MODEL_ARCH.NEO_BERT:       "neo-bert",
     MODEL_ARCH.JINA_BERT_V2:   "jina-bert-v2",
+    MODEL_ARCH.JINA_BERT_V3:   "jina-bert-v3",
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
     MODEL_ARCH.QWEN:           "qwen",
@@ -1234,6 +1238,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.LAYER_OUT_NORM,
         MODEL_TENSOR.CLS,
     ],
+    MODEL_ARCH.JINA_BERT_V3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
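Sanity check: the new registrations are all reachable through the gguf package, which convert_hf_to_gguf.py already relies on (gguf.MODEL_ARCH_NAMES and gguf.Keys.Adapter both appear in the diff above). A minimal sketch, assuming the standard gguf-py star exports:

    import gguf

    # architecture string written to general.architecture for converted models
    print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JINA_BERT_V3])    # "jina-bert-v3"

    # tensor kinds the converter accepts for this architecture
    for t in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.JINA_BERT_V3]:
        print(t.name)

    # metadata keys used by the per-task LoRA writers
    print(gguf.Keys.Adapter.LORA_TASK_NAME)      # adapter.lora.task_name
    print(gguf.Keys.Adapter.LORA_PROMPT_PREFIX)  # adapter.lora.prompt_prefix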
