You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
@@ -1378,16 +1381,91 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1378
1381
} else {
1379
1382
ggml_backend_synchronize(split_backend);
1380
1383
}
1381
-
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1382
-
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1383
-
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1384
+
1385
+
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
// copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
1430
+
// this is necessary for MMQ in the CUDA backend
1431
+
expert_size_copy + padding_end);
1432
+
};
1433
+
1434
+
int id = 0;
1435
+
while (!ggml_bitset_get(used_ids.data(), id)) {
1436
+
id++;
1437
+
}
1438
+
int32_t first_id = id;
1439
+
int32_t last_id = first_id;
1440
+
1441
+
for (++id; id < n_expert; ++id) {
1442
+
if (!ggml_bitset_get(used_ids.data(), id)) {
1443
+
continue;
1444
+
}
1445
+
1446
+
if (id == last_id + 1) {
1447
+
last_id = id;
1448
+
continue;
1449
+
}
1450
+
1451
+
copy_experts(first_id, last_id);
1452
+
1453
+
first_id = id;
1454
+
last_id = id;
1455
+
}
1456
+
copy_experts(first_id, last_id);
1457
+
} else {
1458
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1459
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1460
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1461
+
ggml_backend_synchronize(input_backend);
1462
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
0 commit comments