@@ -852,20 +852,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
                 if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                     // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+
                     hparams.n_swa = 2047;
                 } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                     // default value for Phi-3-mini-128k-instruct
-                    // note: this seems incorrect because the window is bigger than the train context?
-                    hparams.n_swa = 262144;
+                    LLAMA_LOG_WARN("%s: assuming n_swa = n_ctx_train for Phi-3-mini-128k-instruct\n", __func__);
+
+                    hparams.n_swa = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                 } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                     // default value for Phi-3-medium-128k-instruct
-                    // note: this seems incorrect because the window is equal to the train context?
-                    hparams.n_swa = 131072;
+                    LLAMA_LOG_WARN("%s: assuming n_swa = n_ctx_train for Phi-3-medium-128k-instruct\n", __func__);
+
+                    hparams.n_swa = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
                 }
+
                 bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                 if (!found_swa && hparams.n_swa == 0) {
                     throw std::runtime_error("invalid value for sliding_window");
                 }
+
+                if (hparams.n_swa > hparams.n_ctx_train) {
+                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d > %d, clamping to n_ctx_train and disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
+
+                    hparams.n_swa = hparams.n_ctx_train;
+                    hparams.n_swa_pattern = 1;
+                }
             } break;
         case LLM_ARCH_PHIMOE:
             {
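
For context (not part of the diff): below is a minimal standalone C++ sketch of the sanity check added at the end of this hunk. The `hparams_sketch` struct and its default values are hypothetical stand-ins, not the llama.cpp types; the point is only the clamping behavior, where a sliding window reported larger than the training context is clamped to `n_ctx_train` and the SWA pattern is reset to 1, effectively disabling sliding-window attention.

```cpp
// Minimal sketch of the n_swa sanity check (simplified stand-in, not the llama.cpp implementation).
#include <cstdint>
#include <cstdio>

struct hparams_sketch {
    uint32_t n_ctx_train   = 131072; // training context reported by the model
    uint32_t n_swa         = 262144; // sliding window from GGUF metadata (suspicious: larger than n_ctx_train)
    uint32_t n_swa_pattern = 2;      // SWA layer pattern; 1 means no SWA layers in this sketch
};

int main() {
    hparams_sketch hp;

    // If the reported window exceeds the training context, clamp it and disable the SWA pattern.
    if (hp.n_swa > hp.n_ctx_train) {
        std::printf("unexpected n_swa: %u > %u, clamping to n_ctx_train and disabling SWA\n",
                    hp.n_swa, hp.n_ctx_train);
        hp.n_swa         = hp.n_ctx_train;
        hp.n_swa_pattern = 1;
    }

    std::printf("n_swa = %u, n_swa_pattern = %u\n", hp.n_swa, hp.n_swa_pattern);
    return 0;
}
```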