
Commit f68cb3c

Merge pull request #198 from menloresearch/update-dev-from-master-2025-08-08-00-13
Sync master with upstream release b6115
2 parents: 5083622 + 50aa938; commit f68cb3c

28 files changed: +1116 -318 lines

convert_hf_to_gguf.py

Lines changed: 123 additions & 7 deletions
@@ -3328,7 +3328,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
+        if isinstance(self.hparams_vision['image_size'], list):
+            self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
+        if isinstance(self.hparams_vision['patch_size'], list):
+            self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
         super().set_gguf_parameters()
+
         hparams = self.hparams
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
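Aside (not part of the diff): a minimal standalone sketch of the list-to-scalar normalization the hunk above performs. The function name and the sample config are made up for illustration.

# Some InternVL/Intern-S1 vision configs store image_size and patch_size as
# single-element lists; collapse them to scalars before writing GGUF parameters.
def normalize_vision_hparams(hparams_vision: dict) -> dict:
    for key in ("image_size", "patch_size"):
        if isinstance(hparams_vision.get(key), list):
            hparams_vision[key] = hparams_vision[key][0]  # take the first entry
    return hparams_vision

print(normalize_vision_hparams({"image_size": [448], "patch_size": [14]}))
# -> {'image_size': 448, 'patch_size': 14}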
@@ -3352,14 +3358,30 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
             return gguf.GGMLQuantizationType.F32
         return False
 
+    def _mapping_interns1_name(self, name):
+        names_map = {
+            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
+            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
+            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
+            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
+            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
+            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
+        }
+        if name in names_map:
+            name = names_map[name]
+        return name
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-        if name.startswith("vision_model") or name.startswith("mlp"):
+        vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
+        # deal with intern-s1 special case
+        name = self._mapping_interns1_name(name)
+        if any([name.startswith(prefix) for prefix in vision_prefix]):
             # process visual tensors
             # correct name
             if name.startswith("vision_model"):
                 name = "vision_tower." + name
-            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+            if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
                 name += ".weight"
             # split QKV tensors if needed
             if ".qkv." in name:
@@ -3445,6 +3467,10 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+            # skip visual tensors
+            return []
         if name.find("experts") != -1:
             n_experts = self.hparams["num_experts"]
             assert bid is not None
@@ -3498,6 +3524,85 @@ class Qwen3Model(Qwen2Model):
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
+        additional_special_tokens = []
+        if special_tokens_map_file.is_file():
+            with open(special_tokens_map_file, encoding = 'utf-8') as f:
+                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
+        if tokenizer_cfg_file.is_file():
+            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
+                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
+                for token in additional_special_tokens:
+                    if token in token2ids_map:
+                        special_vocab._set_special_token(token, token2ids_map[token])
+                special_vocab._set_special_token('eos', 151645)
+                special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
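Aside (not part of the diff): a hedged sketch of the vocab-filling loop in _set_vocab_interns1 above. Token ids must form a contiguous range for GGUF, so any id missing from the tokenizer's vocab is filled with a "[PAD{i}]" placeholder and marked unused. The tiny stand-in vocab and the UNUSED/NORMAL strings below replace the real gguf.TokenType constants for the sake of a self-contained example.

def build_token_list(vocab: dict[str, int], vocab_size: int) -> list[tuple[str, str]]:
    # Invert token -> id so we can walk the id space in order.
    reverse_vocab = {i: tok for tok, i in vocab.items()}
    out = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            out.append((f"[PAD{i}]", "UNUSED"))  # filler for gaps in the id space
        else:
            out.append((reverse_vocab[i], "NORMAL"))
    return out

print(build_token_list({"<s>": 0, "hello": 1, "world": 3}, vocab_size=5))
# ids 2 and 4 come back as [PAD2] / [PAD4] placeholders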
@@ -7997,15 +8102,13 @@ def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         blocks0: Tensor = torch.zeros(1)
         blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
         # we assume that tensors are loaded in the correct order
         for name, data_torch in self.get_tensors():
             if "mlp.experts.down_proj_blocks" in name:
                 blocks0 = data_torch
             elif "mlp.experts.down_proj_scales" in name:
                 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
                 self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
             elif "mlp.experts.gate_up_proj_blocks" in name:
                 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
             elif "mlp.experts.gate_up_proj_scales" in name:
@@ -8014,9 +8117,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
                 self.repack_mxfp4(new_name_gate, blocks0, scales0)
                 self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
         return []
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8029,7 +8129,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if "down_proj" in name:
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name = name.replace("down_proj", "down_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []
 
         # split the gate_up into gate and up
@@ -8042,7 +8147,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
+                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_weight),
+                    (self.map_tensor_name(name_up), up_proj_weight)
+                ]
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []
 
         return [(self.map_tensor_name(name), data_torch)]
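Aside (not part of the diff): a minimal sketch, assuming PyTorch is available, of the gate_up de-interleaving used above for non-MXFP4 checkpoints. After transposing to (experts, 2*ff, hidden), even rows along dim 1 go to the gate projection and odd rows to the up projection. The tensor shape here is an arbitrary toy example.

import torch

n_experts, hidden, ff = 2, 4, 3
# Fused gate_up tensor as stored in the checkpoint: (experts, hidden, 2*ff).
gate_up = torch.arange(n_experts * hidden * 2 * ff, dtype=torch.float32).reshape(n_experts, hidden, 2 * ff)

data = gate_up.transpose(-1, -2)      # -> (n_experts, 2*ff, hidden)
gate_proj = data[:, ::2, :]           # even rows -> gate_proj.weight
up_proj   = data[:, 1::2, :]          # odd rows  -> up_proj.weight

print(gate_proj.shape, up_proj.shape) # torch.Size([2, 3, 4]) torch.Size([2, 3, 4])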

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -176,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12"    OFF)
 option(GGML_HIP_MMQ_MFMA                    "ggml: enable MFMA MMA for CDNA in MMQ"           ON)
+option(GGML_HIP_EXPORT_METRICS              "ggml: enable kernel perf metrics output"         OFF)
 option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
 option(GGML_MUSA_MUDNN_COPY                 "ggml: enable muDNN for accelerated copy"         OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)

ggml/cmake/ggml-config.cmake.in

Lines changed: 42 additions & 40 deletions
@@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)
 
     find_library(GGML_LIBRARY ggml
         REQUIRED
-        HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
+        HINTS ${GGML_LIB_DIR}
         NO_CMAKE_FIND_ROOT_PATH)
 
     add_library(ggml::ggml UNKNOWN IMPORTED)
@@ -125,54 +125,56 @@ if(NOT TARGET ggml::ggml)
             IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
 
     set(_ggml_all_targets "")
-    foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
-        string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
-        string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
-
-        find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
-            REQUIRED
-            HINTS ${GGML_LIB_DIR}
-            NO_CMAKE_FIND_ROOT_PATH)
-
-        message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
-
-        add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
-        set_target_properties(ggml::${_ggml_backend}
-            PROPERTIES
-                INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
-                IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-                IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
-                INTERFACE_COMPILE_FEATURES c_std_90
-                POSITION_INDEPENDENT_CODE ON)
-
-        string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
-        if(is_cpu_variant)
-            list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
-            set_target_properties(ggml::${_ggml_backend}
-                PROPERTIES
-                    INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+    if (NOT GGML_BACKEND_DL)
+        foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+            string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+            string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
 
-            if(GGML_CPU_INTERFACE_LINK_OPTIONS)
-                set_target_properties(ggml::${_ggml_backend}
-                    PROPERTIES
-                        INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
-            endif()
+            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+                REQUIRED
+                HINTS ${GGML_LIB_DIR}
+                NO_CMAKE_FIND_ROOT_PATH)
+
+            message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
 
-        else()
-            list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+            add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
             set_target_properties(ggml::${_ggml_backend}
                 PROPERTIES
-                    INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+                    INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+                    IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+                    IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+                    INTERFACE_COMPILE_FEATURES c_std_90
+                    POSITION_INDEPENDENT_CODE ON)
+
+            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+            if(is_cpu_variant)
+                list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+                set_target_properties(ggml::${_ggml_backend}
+                    PROPERTIES
+                        INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+
+                if(GGML_CPU_INTERFACE_LINK_OPTIONS)
+                    set_target_properties(ggml::${_ggml_backend}
+                        PROPERTIES
+                            INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
+                endif()
 
-            if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+            else()
+                list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                 set_target_properties(ggml::${_ggml_backend}
                     PROPERTIES
-                        INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+                        INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+                if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+                    set_target_properties(ggml::${_ggml_backend}
+                        PROPERTIES
+                            INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+                endif()
             endif()
-        endif()
 
-        list(APPEND _ggml_all_targets ggml::${_ggml_backend})
-    endforeach()
+            list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+        endforeach()
+    endif()
 
     list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
     set_target_properties(ggml::ggml

ggml/src/ggml-cuda/common.cuh

Lines changed: 10 additions & 2 deletions
@@ -233,9 +233,13 @@ typedef float2 dfloat2;
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
-#define NEW_MMA_AVAILABLE
+#define TURING_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#define AMPERE_MMA_AVAILABLE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -303,10 +307,14 @@ static bool amd_mfma_available(const int cc) {
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static bool new_mma_available(const int cc) {
+static bool turing_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
+static bool ampere_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
+}
+
 static bool cp_async_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
 }

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 6 additions & 6 deletions
@@ -418,7 +418,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
         float * const __restrict__ KQ_max,
         float * const __restrict__ KQ_rowsum,
         const int kb0) {
-#ifdef NEW_MMA_AVAILABLE
+#ifdef TURING_MMA_AVAILABLE
     typedef fattn_mma_f16_config<DKQ, DV> c;
 
 #ifdef CP_ASYNC_AVAILABLE
@@ -776,7 +776,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
     GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
     NO_DEVICE_CODE;
-#endif // NEW_MMA_AVAILABLE
+#endif // TURING_MMA_AVAILABLE
 }
 
 template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup>
@@ -800,7 +800,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         const int jt,
         const int kb0_start,
         const int kb0_stop) {
-#ifdef NEW_MMA_AVAILABLE
+#ifdef TURING_MMA_AVAILABLE
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     typedef fattn_mma_f16_config<DKQ, DV> c;
@@ -1196,7 +1196,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
-#endif // NEW_MMA_AVAILABLE
+#endif // TURING_MMA_AVAILABLE
 }
 
 template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla>
@@ -1223,7 +1223,7 @@ static __global__ void flash_attn_ext_f16(
         const int32_t nb21, const int32_t nb22, const int64_t nb23,
         const int32_t ne31, const int32_t ne32, const int32_t ne33,
         const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
+#if defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@@ -1354,7 +1354,7 @@ static __global__ void flash_attn_ext_f16(
     GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
     GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
     NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
+#endif // defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
 }
 
 template <int DKQ, int DV, int ncols1, int ncols2>
