Commit b5d2f1e

Merge remote-tracking branch 'ggml-org/master' into allozaur/svelte-webui
2 parents 606d3f8 + cd08fc3 commit b5d2f1e

9 files changed: 119 additions, 21 deletions

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion

@@ -97,7 +97,7 @@ jobs:
           ctest -L 'main|curl' --verbose --timeout 900
 
   macOS-latest-cmake-x64:
-    runs-on: macos-latest
+    runs-on: macos-13
 
     steps:
       - name: Clone

common/json-schema-to-grammar.cpp

Lines changed: 7 additions & 6 deletions

@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
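
This hunk swaps a lazily filled static set for a function-local static initialized by an immediately-invoked lambda: since C++11 such initialization is guaranteed to run exactly once and is thread-safe, whereas the old check-then-fill pattern could race on concurrent first calls. A minimal standalone sketch of the same idiom (the names is_keyword/KEYWORDS are illustrative, not repository code):

#include <initializer_list>
#include <string>
#include <unordered_set>

// Sketch of the idiom adopted above: a function-local static const set whose
// initializer is an immediately-invoked lambda. The C++11 "magic statics"
// guarantee ensures the lambda runs exactly once, even under concurrent calls.
static bool is_keyword(const std::string & word) {
    static const std::unordered_set<std::string> KEYWORDS = [] {
        std::unordered_set<std::string> s;
        for (const char * k : { "if", "else", "while", "return" }) s.insert(k);
        return s;
    }();
    return KEYWORDS.find(word) != KEYWORDS.end();
}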

convert_hf_to_gguf.py

Lines changed: 25 additions & 0 deletions

@@ -6009,9 +6009,34 @@ class SeedOssModel(TextModel):
 
 
 @ModelBase.register("Olmo2ForCausalLM")
+@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        if "sliding_window" in self.hparams:
+            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+        sliding_window_pattern = []
+        if "layer_types" in self.hparams:
+            sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+        else:
+            # Olmo2 does not use sliding window attention.
+            # Olmo3 defaults to using sliding window for all layers except every 4th.
+            for i in range(self.hparams["num_hidden_layers"]):
+                sliding_window_pattern.append((i + 1) % 4 != 0)
+
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
 
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
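
To make the converter's fallback concrete, here is a small standalone sketch (illustrative only, not repository code) of the pattern it emits for Olmo3 when "layer_types" is absent: every 4th layer gets full attention, all others use the sliding window.

#include <cstdio>
#include <vector>

int main() {
    // Hypothetical layer count chosen for illustration; the converter reads it
    // from the model's "num_hidden_layers" hyperparameter.
    const int num_hidden_layers = 8;

    std::vector<bool> sliding_window_pattern;
    for (int i = 0; i < num_hidden_layers; ++i) {
        // true  -> sliding-window attention layer
        // false -> full-attention layer (every 4th layer)
        sliding_window_pattern.push_back((i + 1) % 4 != 0);
    }

    for (bool swa : sliding_window_pattern) {
        std::printf("%d ", swa ? 1 : 0);
    }
    std::printf("\n"); // prints: 1 1 1 0 1 1 1 0
    return 0;
}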

examples/simple/simple.cpp

Lines changed: 14 additions & 0 deletions

@@ -145,6 +145,20 @@ int main(int argc, char ** argv) {
 
     llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        batch = llama_batch_get_one(&decoder_start_token_id, 1);
+    }
+
     // main loop
 
     const auto t_main_start = ggml_time_us();

ggml/src/ggml-cann/common.h

Lines changed: 4 additions & 1 deletion

@@ -526,7 +526,10 @@ struct ggml_backend_cann_context {
      */
     aclrtStream stream(int stream) {
         if (streams[stream] == nullptr) {
-            ggml_cann_set_device(device);
+            // If the device is not set here, destroying the stream later may cause a mismatch
+            // between the thread contexts where the stream was created and destroyed.
+            // However, I printed the device_id, thread_id, and stream, and they are all consistent.
+            ACL_CHECK(aclrtSetDevice(device));
             ACL_CHECK(aclrtCreateStream(&streams[stream]));
         }
         return streams[stream];

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 6 additions & 6 deletions

@@ -75,13 +75,12 @@
  * @param device The device ID to set.
  */
 void ggml_cann_set_device(const int32_t device) {
-    // TODO: uncomment these lines after empty context has fixed.
-    // int current_device;
-    // ACL_CHECK(aclrtGetDevice(&current_device));
+    int current_device = -1;
+    aclrtGetDevice(&current_device);
 
-    // if (device == current_device) {
-    //     return;
-    // }
+    if (device == current_device) {
+        return;
+    }
     ACL_CHECK(aclrtSetDevice(device));
 }
 

@@ -1729,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_get_rows(ctx, dst);
             break;
         case GGML_OP_SET_ROWS:
+            std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl;
             ggml_cann_set_rows(ctx, dst);
             break;
         case GGML_OP_DUP:

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 16 additions & 2 deletions

@@ -4423,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 
 static bool ggml_vk_instance_validation_ext_available();
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
-
 static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
 
 static void ggml_vk_instance_init() {
     if (vk_instance_initialized) {

@@ -4540,7 +4540,7 @@ static void ggml_vk_instance_init() {
             new_driver.pNext = &new_id;
             devices[i].getProperties2(&new_props);
 
-            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
+            if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),

@@ -12738,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available(
     UNUSED(instance_extensions);
 }
 
+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
+    VkPhysicalDeviceFeatures2 device_features2;
+    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+
+    VkPhysicalDeviceVulkan11Features vk11_features;
+    vk11_features.pNext = nullptr;
+    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+    device_features2.pNext = &vk11_features;
+
+    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
+
+    return vk11_features.storageBuffer16BitAccess;
+}
+
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
         case VK_VENDOR_ID_INTEL:
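
The new ggml_vk_device_is_supported helper filters out devices that lack storageBuffer16BitAccess, using the standard Vulkan 1.1 pattern of chaining an extended feature struct into VkPhysicalDeviceFeatures2 via pNext. Below is a self-contained sketch of that query pattern, under the assumption that a Vulkan 1.1 capable loader and driver are present (error handling trimmed; this is not repository code):

#include <cstdio>
#include <vector>
#include <vulkan/vulkan.h>

int main() {
    // Minimal instance so we can enumerate physical devices.
    VkApplicationInfo app_info = {};
    app_info.sType      = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    app_info.apiVersion = VK_API_VERSION_1_1;

    VkInstanceCreateInfo create_info = {};
    create_info.sType            = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    create_info.pApplicationInfo = &app_info;

    VkInstance instance = VK_NULL_HANDLE;
    if (vkCreateInstance(&create_info, nullptr, &instance) != VK_SUCCESS) {
        std::fprintf(stderr, "failed to create Vulkan instance\n");
        return 1;
    }

    uint32_t count = 0;
    vkEnumeratePhysicalDevices(instance, &count, nullptr);
    std::vector<VkPhysicalDevice> devices(count);
    vkEnumeratePhysicalDevices(instance, &count, devices.data());

    for (VkPhysicalDevice dev : devices) {
        // Chain the Vulkan 1.1 feature struct through pNext, then query.
        VkPhysicalDeviceVulkan11Features vk11_features = {};
        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;

        VkPhysicalDeviceFeatures2 features2 = {};
        features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
        features2.pNext = &vk11_features;

        vkGetPhysicalDeviceFeatures2(dev, &features2);

        std::printf("storageBuffer16BitAccess: %s\n",
                    vk11_features.storageBuffer16BitAccess ? "yes" : "no");
    }

    vkDestroyInstance(instance, nullptr);
    return 0;
}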

src/llama-model.cpp

Lines changed: 43 additions & 4 deletions

@@ -1350,6 +1350,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;

@@ -12233,6 +12241,7 @@ struct llm_build_olmo : public llm_graph_context {
     }
 };
 
+template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
     llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;

@@ -12248,7 +12257,14 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv();
+        }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 

@@ -12281,17 +12297,36 @@
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_ext(
+                const bool is_swa = hparams.is_swa(il);
+
+                if (is_swa) {
+                    // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+                    // This is achieved here by setting freq_scale and attn_factor to 1.
+                    // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+                } else {
+                    Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-                Kcur = ggml_rope_ext(
+                    Kcur = ggml_rope_ext(
                         ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                     );
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);

@@ -19131,7 +19166,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_OLMO2:
             {
-                llm = std::make_unique<llm_build_olmo2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_OLMOE:
             {
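
The builder is now parameterized on a compile-time iswa flag so the sliding-window and full-attention graphs share one implementation while selecting different attention-input types: std::conditional_t picks the type, if constexpr picks the construction path, and build_graph chooses the template argument at runtime from hparams. A reduced sketch of this dispatch idiom using made-up stand-in types (InputSWA, InputFull, GraphBuilder), not the repository's classes:

#include <cstdio>
#include <memory>
#include <type_traits>

// Stand-ins for the two attention-input types selected at compile time.
struct InputFull { const char * name() const { return "full attention input"; } };
struct InputSWA  { const char * name() const { return "sliding-window attention input"; } };

template <bool iswa>
struct GraphBuilder {
    // Same shape as llm_build_olmo2<iswa>: the member type depends on the flag,
    // and only the matching branch of the if constexpr is ever instantiated.
    using input_t = std::conditional_t<iswa, InputSWA, InputFull>;

    std::unique_ptr<input_t> input;

    GraphBuilder() {
        if constexpr (iswa) {
            input = std::make_unique<InputSWA>();
        } else {
            input = std::make_unique<InputFull>();
        }
        std::printf("built with %s\n", input->name());
    }
};

int main() {
    // In the real code this boolean comes from hparams.swa_type at runtime.
    const bool use_swa = true;
    if (use_swa) {
        GraphBuilder<true> b;
    } else {
        GraphBuilder<false> b;
    }
    return 0;
}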

src/llama-quant.cpp

Lines changed: 3 additions & 1 deletion

@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // attention layers have a non-zero number of kv heads
         int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            n_attn_layer *= 3;
+            // now n_attn_layer is the number of attention layers in the encoder
+            // for each decoder block, there are 2 attention layers
+            n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
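
The corrected count follows from the comments in the hunk: after the subtraction, n_attn_layer holds only the encoder's attention layers, and each decoder block contributes a self-attention plus a cross-attention layer, so the total is enc + 2 * dec rather than 3 * enc (which only coincides with the right answer when decoder and encoder depths match). A quick worked example with hypothetical layer counts:

#include <cstdio>

int main() {
    // Hypothetical counts for illustration only.
    const int enc_layers = 12;
    const int dec_layers = 6;

    const int old_estimate = enc_layers * 3;               // 36; only correct when dec == enc
    const int corrected    = enc_layers + 2 * dec_layers;  // 24: self-attn + cross-attn per decoder block

    std::printf("old: %d, corrected: %d\n", old_estimate, corrected);
    return 0;
}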
