@@ -336,6 +336,7 @@ struct cmd_params {
     std::vector<bool>        use_mmap;
     std::vector<bool>        embeddings;
     std::vector<bool>        no_op_offload;
+    std::vector<bool>        no_host;
     ggml_numa_strategy       numa;
     int                      reps;
     ggml_sched_priority      prio;
@@ -373,6 +374,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap      */ { true },
     /* embeddings    */ { false },
     /* no_op_offload */ { false },
+    /* no_host       */ { false },
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps          */ 5,
     /* prio          */ GGML_SCHED_PRIO_NORMAL,
@@ -453,6 +455,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
     printf("                                            (default: disabled)\n");
     printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
+    printf("  --no-host <0|1>                           (default: %s)\n",
+           join(cmd_params_defaults.no_host, ",").c_str());
     printf("\n");
     printf(
         "Multiple values can be given for each parameter by separating them with ','\n"
@@ -782,6 +786,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
+        } else if (arg == "--no-host") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.no_host.insert(params.no_host.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1003,6 +1014,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_op_offload.empty()) {
         params.no_op_offload = cmd_params_defaults.no_op_offload;
     }
+    if (params.no_host.empty()) {
+        params.no_host = cmd_params_defaults.no_host;
+    }
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
@@ -1044,6 +1058,7 @@ struct cmd_params_instance {
     bool               use_mmap;
     bool               embeddings;
     bool               no_op_offload;
+    bool               no_host;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -1056,6 +1071,7 @@ struct cmd_params_instance {
         mparams.main_gpu     = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap     = use_mmap;
+        mparams.no_host      = no_host;
 
         if (n_cpu_moe <= 0) {
             if (tensor_buft_overrides.empty()) {
@@ -1101,6 +1117,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
                devices == other.devices &&
+               no_host == other.no_host &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
@@ -1136,6 +1153,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & noh : params.no_host)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
     for (const auto & nb : params.n_batch)
@@ -1178,6 +1196,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
             /* .no_op_offload= */ nopo,
+            /* .no_host      = */ noh,
         };
         instances.push_back(instance);
     }
@@ -1211,6 +1230,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
             /* .no_op_offload= */ nopo,
+            /* .no_host      = */ noh,
         };
         instances.push_back(instance);
     }
@@ -1244,6 +1264,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
             /* .no_op_offload= */ nopo,
+            /* .no_host      = */ noh,
         };
         instances.push_back(instance);
     }
@@ -1282,6 +1303,7 @@ struct test {
     bool            use_mmap;
     bool            embeddings;
     bool            no_op_offload;
+    bool            no_host;
     int             n_prompt;
     int             n_gen;
     int             n_depth;
@@ -1318,6 +1340,7 @@ struct test {
         use_mmap       = inst.use_mmap;
         embeddings     = inst.embeddings;
         no_op_offload  = inst.no_op_offload;
+        no_host        = inst.no_host;
         n_prompt       = inst.n_prompt;
         n_gen          = inst.n_gen;
         n_depth        = inst.n_depth;
@@ -1375,8 +1398,8 @@ struct test {
             "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
             "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
-            "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
-            "stddev_ns", "avg_ts", "stddev_ts"
+            "no_host", "n_prompt", "n_gen", "n_depth", "test_time",
+            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
         };
         return fields;
     }
@@ -1391,7 +1414,7 @@ struct test {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-            field == "use_mmap" || field == "embeddings") {
+            field == "use_mmap" || field == "embeddings" || field == "no_host") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1466,6 +1489,7 @@ struct test {
             std::to_string(use_mmap),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
+            std::to_string(no_host),
             std::to_string(n_prompt),
             std::to_string(n_gen),
             std::to_string(n_depth),
@@ -1654,6 +1678,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return 4;
         }
+        if (field == "no_host") {
+            return 4;
+        }
 
         int width = std::max((int) field.length(), 10);
@@ -1688,6 +1715,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "no_host") {
+            return "noh";
+        }
         if (field == "devices") {
             return "dev";
         }
@@ -1768,6 +1798,9 @@ struct markdown_printer : public printer {
         if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
             fields.emplace_back("no_op_offload");
         }
+        if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
+            fields.emplace_back("no_host");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
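
Note: the new --no-host option follows llama-bench's existing multi-value convention: the argument is split on commas into a std::vector<bool>, and get_cmd_params_instances() then iterates over every value (see the added "for (const auto & noh : params.no_host)" loop), producing one test instance per setting. The sketch below illustrates that parsing step in isolation; it is not the llama.cpp implementation, and the helper name split_bool_list is hypothetical.

    // Minimal standalone sketch of parsing a comma-separated "<0|1>,<0|1>,..."
    // argument (e.g. "--no-host 0,1") into a std::vector<bool>, which the
    // benchmark then sweeps over when building test instances.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    static std::vector<bool> split_bool_list(const std::string & arg, char delim = ',') {
        std::vector<bool> out;
        std::stringstream ss(arg);
        std::string item;
        while (std::getline(ss, item, delim)) {
            out.push_back(item != "0"); // "0" -> false, anything else -> true
        }
        return out;
    }

    int main() {
        // With an invocation like `llama-bench --no-host 0,1`, both values end
        // up in the vector, so each test configuration is run once per setting.
        for (bool noh : split_bool_list("0,1")) {
            std::cout << "no_host = " << noh << "\n";
        }
        return 0;
    }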