
Commit 1bf5cf0

Merge branch 'mradermacher' into master
2 parents 4a5686d + 322338b

9 files changed: +135 -28 lines changed

common/common.cpp

Lines changed: 4 additions & 0 deletions

@@ -915,6 +915,10 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
+        if(getenv("DRYRUN")) {
+            LOG_ERR("%s: Dryrun completed!\n", __func__);
+            exit(0);
+        }
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
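
The same guard recurs throughout this commit: every allocation or load site checks getenv("DRYRUN") and, when the variable is set, reports what it would have done instead of doing it, until the context creation above prints "Dryrun completed!" and exits. A minimal standalone sketch of that pattern follows; it uses only standard C++, and the helper names (dryrun_enabled, alloc_or_report) are illustrative, not part of the patch.

// Minimal sketch of the DRYRUN guard used across this commit (illustrative only;
// helper names are hypothetical). getenv() returns a non-NULL pointer whenever the
// variable is set, so DRYRUN=1, DRYRUN=anything, or even an empty DRYRUN= enables it.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

static bool dryrun_enabled() {
    return std::getenv("DRYRUN") != nullptr;
}

// In a dry run, report the size that would have been allocated and return nullptr,
// mirroring the buffer-type hooks patched in ggml-backend.cpp and ggml-cuda.cu.
static void * alloc_or_report(std::size_t size) {
    if (dryrun_enabled()) {
        std::fprintf(stderr, "[DRYRUN]: %zu\n", size);
        return nullptr;
    }
    return std::malloc(size);
}

int main() {
    void * p = alloc_or_report(1024);
    if (p == nullptr && dryrun_enabled()) {
        std::fprintf(stderr, "Dryrun completed!\n");
        return 0;
    }
    std::free(p);
    return 0;
}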

ggml/src/ggml-backend.cpp

Lines changed: 5 additions & 0 deletions

@@ -1966,6 +1966,11 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %ld\n", size);
+        return NULL;
+    }
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 19 additions & 4 deletions

@@ -679,6 +679,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+        return nullptr;
+    }
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
@@ -857,12 +862,18 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
+
         char * buf;
-        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+        if(getenv("DRYRUN")) {
+            GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", id, size);
+            buf = nullptr;
+        } else {
+            CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            // set padding to 0 to avoid possible NaN values
+            if (size > original_size) {
+                CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            }
         }
 
         extra->data_device[id] = buf;
@@ -1118,6 +1129,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %ld\n", size);
+        return nullptr;
+    }
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {

gguf-py/gguf/gguf_writer.py

Lines changed: 4 additions & 0 deletions

@@ -238,6 +238,10 @@ def write_kv_data_to_file(self) -> None:
         kv_bytes = bytearray()
 
         for key, val in kv_data.items():
+            if val.type != GGUFValueType.ARRAY or len (val.value) < 50:
+                print("gguf serialising key ", key, "value", val)
+            else:
+                print("gguf serialising key ", key, "value-suppressed")
             kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
             kv_bytes += self._pack_val(val.value, val.type, add_vtype=True, sub_type=val.sub_type)
 
src/llama-kv-cache-unified.cpp

Lines changed: 4 additions & 0 deletions

@@ -135,6 +135,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+                return;
+            }
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }
 
src/llama-memory-recurrent.cpp

Lines changed: 4 additions & 0 deletions

@@ -99,6 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for rs cache was successful due to dry-run being enabled\n", __func__);
+                return;
+            }
             throw std::runtime_error("failed to allocate buffer for rs cache");
         }
         ggml_backend_buffer_clear(buf, 0);

src/llama-model-loader.cpp

Lines changed: 7 additions & 0 deletions

@@ -711,6 +711,13 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+    if(getenv("DRYRUN")) {
+        if (use_mmap) {
+            LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+            use_mmap = false;
+        }
+    }
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }

src/llama-model.cpp

Lines changed: 21 additions & 11 deletions

@@ -4845,7 +4845,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
-                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                if(getenv("DRYRUN")) {
+                    LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+                } else {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                }
             }
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
@@ -4863,10 +4867,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error("failed to allocate buffer");
             }
 
-            for (auto & buf : buf_map) {
-                // indicate that this buffer contains weights
-                // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-                ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            if(!getenv("DRYRUN")) {
+                for (auto & buf : buf_map) {
+                    // indicate that this buffer contains weights
+                    // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+                    ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+                }
             }
 
             ctx_bufs.emplace_back(ctx, buf_map);
@@ -4887,8 +4893,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    if(!getenv("DRYRUN")) {
+        for (auto & buf : pimpl->bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
     }
 
     // populate tensors_by_name
@@ -4899,11 +4907,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    if(!getenv("DRYRUN")) {
+        for (auto & it : ctx_bufs) {
+            ggml_context * ctx = it.first;
+            auto & bufs = it.second;
+            if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
                 return false;
+            }
         }
     }
 
tools/imatrix/imatrix.cpp

Lines changed: 67 additions & 13 deletions

@@ -33,6 +33,7 @@ struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
    int ncall = 0;
+    int n_as = 1;
 };
 
 class IMatrixCollector {
@@ -127,11 +128,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (e.values.empty()) {
             e.values.resize(src1->ne[0]*n_as, 0);
             e.counts.resize(src1->ne[0]*n_as, 0);
+            e.n_as = n_as;
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
+        else if (e.n_as != n_as) {
+            LOG_ERR("%s: inconsistent n_as for %s (%d vs %d)\n", __func__, wname.c_str(), e.n_as, n_as);
+        }
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         // loop over all possible experts, regardless if they are used or not in the batch
         for (int ex = 0; ex < n_as; ++ex) {
@@ -173,23 +178,36 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     } else {
         auto & e = m_stats[wname];
         if (e.values.empty()) {
-            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            if (src0->ne[3] > 1) {
+                LOG_ERR("Unsupported 4D tensor %s\n", wname.c_str());
+                exit(1);
+            }
+            // If we have a 3D tensor as it is the case for the attn_k_b and attn_v_b for DeepSeek MLA models,
+            // than we need to compute the imatrix for each head, and not just one imatrx for all heads.
+            // Hence, the storage we need is src0->ne[0]*src0->ne[2].
+            e.values.resize(src0->ne[0]*src0->ne[2], 0);
+            e.counts.resize(src0->ne[0]*src0->ne[2], 0);
         }
-        else if (e.values.size() != (size_t)src1->ne[0]) {
+        else if (e.values.size() != (size_t)(src0->ne[0]*src0->ne[2])) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        for (int row = 0; row < (int)src1->ne[1]; ++row) {
-            const float * x = (const float *) (data + row * src1->nb[1]);
-            for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
-                    exit(1);
+        int rk2 = src1->ne[2]/src0->ne[2];
+        for (int i12 = 0; i12 < (int)src1->ne[2]; ++i12) { // i.e., loop over attention heads for MLA models
+            int i02 = i12/rk2;
+            auto values = e.values.data() + i02*src0->ne[0];
+            auto counts = e.counts.data() + i02*src0->ne[0];
+            for (int i11 = 0; i11 < (int)src1->ne[1]; ++i11) {
+                const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    values[j] += x[j]*x[j];
+                    counts[j]++;
+                    if (!std::isfinite(values[j])) {
+                        LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                        exit(1);
+                    }
                 }
             }
         }
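
The hunk above sizes the non-expert statistics per attention head: a 3D weight src0 with ne[2] heads gets src0->ne[0]*src0->ne[2] accumulators, and activation slice i12 of src1 is folded into head i02 = i12/rk2, where rk2 = src1->ne[2]/src0->ne[2] is the broadcast factor. A small standalone sketch of just that index mapping (the head and slice counts below are made up for illustration):

// Illustration of the head-index mapping used in the hunk above (hypothetical shapes).
#include <cstdio>

int main() {
    const int n_head       = 4;             // plays the role of src0->ne[2] in the patch
    const int n_act_slices = 8;             // plays the role of src1->ne[2] in the patch
    const int rk2          = n_act_slices / n_head;  // broadcast factor

    for (int i12 = 0; i12 < n_act_slices; ++i12) {
        const int i02 = i12 / rk2;          // head whose accumulators receive this slice
        std::printf("activation slice %d -> head %d\n", i12, i02);
    }
    return 0;
}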
@@ -221,6 +239,10 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     int n_entries = 0;
     std::vector<std::string> to_store;
 
+    // Retrieve the REQUIRED_GOOD_EXPERT_PERCENTAGE from the environment
+    const char* required_good_expert_percentage_env_value = getenv("REQUIRED_GOOD_EXPERT_PERCENTAGE");
+    double required_good_expert_percentage = required_good_expert_percentage_env_value ? std::clamp(std::stod(required_good_expert_percentage_env_value), 0.0, 100.0) : 90.0;
+
     bool is_first = true; // for printing
     for (const auto & kv : m_stats) {
         const int n_all = kv.second.counts.size();
@@ -247,8 +269,40 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         }
 
         if (n_zeros > 0) {
-            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
-            continue;
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            bool store_it = false;
+            if (kv.second.n_as > 1) {
+                int n_per_expert = n_all / kv.second.n_as;
+                std::vector<int> bad_experts;
+                bad_experts.reserve(kv.second.n_as);
+                for (int i = 0; i < kv.second.n_as; ++i) {
+                    auto counts = kv.second.counts.data() + i*n_per_expert;
+                    int nz_i = 0;
+                    for (int j = 0; j < n_per_expert; ++j) {
+                        if (counts[j] == 0) ++nz_i;
+                    }
+                    if (nz_i > 0) bad_experts.push_back(i);
+                }
+                size_t required_good_experts = round((kv.second.n_as * required_good_expert_percentage) / 100.0);
+                size_t good_experts = kv.second.n_as - bad_experts.size();
+                LOG_WRN("%s: %d out of %d experts are missing data - %ld out of %ld required\n", __func__, int(bad_experts.size()), kv.second.n_as, good_experts, required_good_experts);
+                if (good_experts >= required_good_experts) {
+                    LOG_WRN("%s: %d out of %d experts are missing data - storing but be aware\n", __func__, int(bad_experts.size()), kv.second.n_as);
+                    store_it = true;
+                    for (auto i : bad_experts) {
+                        auto counts = const_cast<int*>(kv.second.counts.data()) + i * n_per_expert;
+                        auto values = const_cast<float*>(kv.second.values.data()) + i * n_per_expert;
+                        for (int j = 0; j < n_per_expert; ++j) {
+                            counts[j] = 1;
+                            values[j] = 1;
+                        }
+                    }
+                }
+            }
+            if (!store_it) {
+                LOG_WRN("%s: Skipping expert with missing data!\n", __func__);
+                continue;
+            }
         }
 
         n_entries++;
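
The new REQUIRED_GOOD_EXPERT_PERCENTAGE knob (default 90, clamped to 0..100) decides whether an expert tensor with partial data is still stored: the entry is kept when at least round(n_as * percentage / 100) experts have complete counts, and the values and counts of the remaining bad experts are filled with 1 so the stored imatrix stays usable. A small sketch of just that threshold arithmetic, with made-up expert counts (and, like the patch, assuming the variable parses as a number):

// Worked example of the expert threshold introduced in save_imatrix (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    // Same default and clamping as the patch: unset -> 90, otherwise clamp to [0, 100].
    const char * env = std::getenv("REQUIRED_GOOD_EXPERT_PERCENTAGE");
    const double pct = env ? std::clamp(std::stod(env), 0.0, 100.0) : 90.0;

    const int n_as        = 64;  // hypothetical number of experts in the tensor
    const int bad_experts = 5;   // hypothetical experts with at least one zero count

    const std::size_t required_good = (std::size_t) std::round(n_as * pct / 100.0); // 64 * 90 / 100 = 57.6 -> 58
    const std::size_t good          = (std::size_t) (n_as - bad_experts);           // 59

    std::printf("%zu good experts, %zu required -> %s\n",
                good, required_good,
                good >= required_good ? "store (fill bad experts with 1)" : "skip");
    return 0;
}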
