
Commit a5365d2

feat(minor): improve memory usage estimation

1 parent 93346f9 · commit a5365d2

34 files changed: +935 −330 lines

llama/addon/addon.cpp

Lines changed: 18 additions & 0 deletions

@@ -9,6 +9,7 @@
 #include "globals/addonProgress.h"
 #include "globals/getGpuInfo.h"
 #include "globals/getSwapInfo.h"
+#include "globals/getMemoryInfo.h"

 bool backendInitialized = false;
 bool backendDisposed = false;
@@ -25,6 +26,21 @@ Napi::Value addonGetSupportsMmap(const Napi::CallbackInfo& info) {
     return Napi::Boolean::New(info.Env(), llama_supports_mmap());
 }

+Napi::Value addonGetGpuSupportsMmap(const Napi::CallbackInfo& info) {
+    const auto llamaSupportsMmap = llama_supports_mmap();
+    const auto gpuDevice = getGpuDevice().first;
+
+    if (gpuDevice == nullptr) {
+        return Napi::Boolean::New(info.Env(), false);
+    }
+
+    ggml_backend_dev_props props;
+    ggml_backend_dev_get_props(gpuDevice, &props);
+
+    const bool gpuSupportsMmap = llamaSupportsMmap && props.caps.buffer_from_host_ptr;
+    return Napi::Boolean::New(info.Env(), gpuSupportsMmap);
+}
+
 Napi::Value addonGetSupportsMlock(const Napi::CallbackInfo& info) {
     return Napi::Boolean::New(info.Env(), llama_supports_mlock());
 }
@@ -210,6 +226,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
         Napi::PropertyDescriptor::Function("systemInfo", systemInfo),
         Napi::PropertyDescriptor::Function("getSupportsGpuOffloading", addonGetSupportsGpuOffloading),
         Napi::PropertyDescriptor::Function("getSupportsMmap", addonGetSupportsMmap),
+        Napi::PropertyDescriptor::Function("getGpuSupportsMmap", addonGetGpuSupportsMmap),
         Napi::PropertyDescriptor::Function("getSupportsMlock", addonGetSupportsMlock),
         Napi::PropertyDescriptor::Function("getMathCores", addonGetMathCores),
         Napi::PropertyDescriptor::Function("getBlockSizeForGgmlType", addonGetBlockSizeForGgmlType),
@@ -221,6 +238,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
         Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo),
         Napi::PropertyDescriptor::Function("getGpuType", getGpuType),
         Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo),
+        Napi::PropertyDescriptor::Function("getMemoryInfo", getMemoryInfo),
         Napi::PropertyDescriptor::Function("loadBackends", addonLoadBackends),
         Napi::PropertyDescriptor::Function("init", addonInit),
         Napi::PropertyDescriptor::Function("dispose", addonDispose),

llama/addon/globals/getGpuInfo.cpp

Lines changed: 19 additions & 5 deletions

@@ -89,17 +89,17 @@ Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info) {
     return result;
 }

-Napi::Value getGpuType(const Napi::CallbackInfo& info) {
+std::pair<ggml_backend_dev_t, std::string> getGpuDevice() {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t device = ggml_backend_dev_get(i);
         const auto deviceName = std::string(ggml_backend_dev_name(device));

         if (deviceName == "Metal") {
-            return Napi::String::New(info.Env(), "metal");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "metal");
         } else if (std::string(deviceName).find("Vulkan") == 0) {
-            return Napi::String::New(info.Env(), "vulkan");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "vulkan");
         } else if (std::string(deviceName).find("CUDA") == 0 || std::string(deviceName).find("ROCm") == 0 || std::string(deviceName).find("MUSA") == 0) {
-            return Napi::String::New(info.Env(), "cuda");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "cuda");
         }
     }

@@ -108,9 +108,23 @@ Napi::Value getGpuType(const Napi::CallbackInfo& info) {
         const auto deviceName = std::string(ggml_backend_dev_name(device));

         if (deviceName == "CPU") {
-            return Napi::Boolean::New(info.Env(), false);
+            return std::pair<ggml_backend_dev_t, std::string>(device, "cpu");
         }
     }

+    return std::pair<ggml_backend_dev_t, std::string>(nullptr, "");
+}
+
+Napi::Value getGpuType(const Napi::CallbackInfo& info) {
+    const auto gpuDeviceRes = getGpuDevice();
+    const auto device = gpuDeviceRes.first;
+    const auto deviceType = gpuDeviceRes.second;
+
+    if (deviceType == "cpu") {
+        return Napi::Boolean::New(info.Env(), false);
+    } else if (device != nullptr && deviceType != "") {
+        return Napi::String::New(info.Env(), deviceType);
+    }
+
     return info.Env().Undefined();
 }
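
getGpuDevice() now returns both the device handle and its type, and getGpuType() is rebuilt on top of it. The values that reach JavaScript are unchanged: a backend name for a recognized GPU, false when only the CPU device is found, and undefined otherwise. A minimal sketch of interpreting the result, assuming the binding shape declared in AddonTypes.ts:

    // hypothetical stand-in for the loaded native addon
    declare const bindings: {getGpuType(): "metal" | "vulkan" | "cuda" | false | undefined};

    const gpu = bindings.getGpuType() ?? false; // the same fallback Llama.ts applies below
    if (gpu === false)
        console.log("no GPU backend detected, running on CPU");
    else
        console.log("GPU backend:", gpu);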

llama/addon/globals/getGpuInfo.h

Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,10 @@
 #pragma once
+#include <utility>
+#include <string>
 #include "napi.h"
+#include "llama.h"

 Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info);
 Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info);
+std::pair<ggml_backend_dev_t, std::string> getGpuDevice();
 Napi::Value getGpuType(const Napi::CallbackInfo& info);

llama/addon/globals/getMemoryInfo.cpp

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+#include "getMemoryInfo.h"
+#include "addonLog.h"
+
+#ifdef __APPLE__
+#include <iostream>
+#include <mach/mach.h>
+#include <sys/sysctl.h>
+#elif __linux__
+#include <fstream>
+#include <sstream>
+#include <sys/sysinfo.h>
+#elif _WIN32
+#include <iostream>
+#include <windows.h>
+#include <psapi.h>
+#endif
+
+
+Napi::Value getMemoryInfo(const Napi::CallbackInfo& info) {
+    uint64_t totalMemoryUsage = 0;
+
+#ifdef __APPLE__
+    struct mach_task_basic_info taskInfo;
+    mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+    if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&taskInfo, &infoCount) == KERN_SUCCESS) {
+        totalMemoryUsage = taskInfo.virtual_size;
+    } else {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#elif __linux__
+    std::ifstream procStatus("/proc/self/status");
+    std::string line;
+    bool foundMemoryUsage = false;
+    while (std::getline(procStatus, line)) {
+        if (line.rfind("VmSize:", 0) == 0) { // total virtual memory size of the current process
+            std::istringstream iss(line);
+            std::string key, unit;
+            size_t value;
+            if (iss >> key >> value >> unit) {
+                totalMemoryUsage = value * 1024; // convert from kB to bytes
+                foundMemoryUsage = true;
+            }
+            break;
+        }
+    }
+
+    if (!foundMemoryUsage) {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#elif _WIN32
+    PROCESS_MEMORY_COUNTERS_EX memCounters; // the EX variant exposes PrivateUsage
+
+    if (GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*)&memCounters, sizeof(memCounters))) {
+        totalMemoryUsage = memCounters.PrivateUsage;
+    } else {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#endif
+
+    Napi::Object obj = Napi::Object::New(info.Env());
+    obj.Set("total", Napi::Number::New(info.Env(), totalMemoryUsage));
+    return obj;
+}
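
getMemoryInfo() reports the memory usage of the current process in bytes (virtual size on macOS and Linux, private usage on Windows). A minimal sketch of consuming it from TypeScript, assuming the getMemoryInfo(): {total: number} shape added to BindingModule in this commit:

    // hypothetical stand-in for the loaded native addon
    declare const bindings: {getMemoryInfo(): {total: number}};

    function getProcessMemoryUsageMiB(): number {
        const {total} = bindings.getMemoryInfo(); // bytes
        return total / (1024 * 1024); // convert bytes to MiB
    }

    console.log("process memory usage:", getProcessMemoryUsageMiB().toFixed(1), "MiB");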

llama/addon/globals/getMemoryInfo.h

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+#pragma once
+#include "napi.h"
+
+Napi::Value getMemoryInfo(const Napi::CallbackInfo& info);

src/bindings/AddonTypes.ts

Lines changed: 4 additions & 0 deletions

@@ -48,6 +48,7 @@ export type BindingModule = {
     systemInfo(): string,
     getSupportsGpuOffloading(): boolean,
     getSupportsMmap(): boolean,
+    getGpuSupportsMmap(): boolean,
     getSupportsMlock(): boolean,
     getMathCores(): number,
     getBlockSizeForGgmlType(ggmlType: number): number | undefined,
@@ -76,6 +77,9 @@ export type BindingModule = {
         maxSize: number,
         free: number
     },
+    getMemoryInfo(): {
+        total: number
+    },
     init(): Promise<void>,
     loadBackends(forceLoadLibrariesSearchPath?: string): void,
     dispose(): Promise<void>

src/bindings/Llama.ts

Lines changed: 6 additions & 0 deletions

@@ -46,6 +46,7 @@ export class Llama {
     /** @internal */ private readonly _cmakeOptions: Readonly<Record<string, string>>;
     /** @internal */ private readonly _supportsGpuOffloading: boolean;
     /** @internal */ private readonly _supportsMmap: boolean;
+    /** @internal */ private readonly _gpuSupportsMmap: boolean;
     /** @internal */ private readonly _supportsMlock: boolean;
     /** @internal */ private readonly _mathCores: number;
     /** @internal */ private readonly _llamaCppRelease: {
@@ -110,6 +111,7 @@ export class Llama {
         this._gpu = bindings.getGpuType() ?? false;
         this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
         this._supportsMmap = bindings.getSupportsMmap();
+        this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
         this._supportsMlock = bindings.getSupportsMlock();
         this._mathCores = bindings.getMathCores();
         this._consts = bindings.getConsts();
@@ -175,6 +177,10 @@ export class Llama {
         return this._supportsMmap;
     }

+    public get gpuSupportsMmap() {
+        return this._gpuSupportsMmap;
+    }
+
     public get supportsMlock() {
         return this._supportsMlock;
     }
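
The new gpuSupportsMmap getter becomes part of the public Llama instance. A minimal usage sketch, assuming the library's getLlama() entry point:

    import {getLlama} from "node-llama-cpp";

    const llama = await getLlama();

    // supportsMmap reports whether llama.cpp itself supports mmap;
    // gpuSupportsMmap additionally requires the active GPU device to support
    // buffers created from host pointers
    console.log("supportsMmap:", llama.supportsMmap);
    console.log("gpuSupportsMmap:", llama.gpuSupportsMmap);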

src/cli/commands/ChatCommand.ts

Lines changed: 16 additions & 4 deletions

@@ -68,6 +68,7 @@ type ChatCommand = {
     debug: boolean,
     meter: boolean,
     timing: boolean,
+    noMmap: boolean,
     printTimings: boolean
 };

@@ -293,6 +294,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             default: false,
             description: "Print how long it took to generate each response"
         })
+        .option("noMmap", {
+            type: "boolean",
+            default: false,
+            description: "Disable mmap (memory-mapped file) usage"
+        })
         .option("printTimings", {
             alias: "pt",
             type: "boolean",
@@ -306,15 +312,15 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunChat({
                 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
                 batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
-                timing, printTimings
+                timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -330,7 +336,7 @@ async function RunChat({
     contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, meter, timing, printTimings
+    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -353,13 +359,16 @@ async function RunChat({
         logLevel: llamaLogLevel
     });
     const logBatchSize = batchSize != null;
+    const useMmap = !noMmap && llama.supportsMmap;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
-        flashAttention
+        flashAttention,
+        useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
+            useMmap,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -404,6 +413,7 @@ async function RunChat({
             ? {fitContext: {contextSize}}
             : undefined,
         defaultContextFlashAttention: flashAttention,
+        useMmap,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -436,6 +446,7 @@ async function RunChat({
         return await llama.loadModel({
             modelPath: resolvedDraftModelPath,
             defaultContextFlashAttention: flashAttention,
+            useMmap,
             onLoadProgress(loadProgress: number) {
                 progressUpdater.setProgress(loadProgress);
             },
@@ -541,6 +552,7 @@ async function RunChat({
     const padTitle = await printCommonInfoLines({
         context,
         draftContext,
+        useMmap,
         printBos: true,
         printEos: true,
         logBatchSize,
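
The new --noMmap flag only disables mmap; when it is not passed, mmap is used only if the backend reports support. A minimal sketch of the same resolution outside the CLI, assuming loadModel() accepts the useMmap option the command passes above:

    import {getLlama} from "node-llama-cpp";

    const llama = await getLlama();
    const noMmap = process.argv.includes("--noMmap"); // stand-in for the yargs option

    // mirror the CLI: use mmap only when it isn't disabled and the backend supports it
    const useMmap = !noMmap && llama.supportsMmap;

    const model = await llama.loadModel({
        modelPath: "path/to/model.gguf", // placeholder path
        useMmap
    });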

src/cli/commands/CompleteCommand.ts

Lines changed: 16 additions & 4 deletions

@@ -50,6 +50,7 @@ type CompleteCommand = {
     debug: boolean,
     meter: boolean,
     timing: boolean,
+    noMmap: boolean,
     printTimings: boolean
 };

@@ -220,6 +221,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
             default: false,
             description: "Print how long it took to generate each response"
         })
+        .option("noMmap", {
+            type: "boolean",
+            default: false,
+            description: "Disable mmap (memory-mapped file) usage"
+        })
         .option("printTimings", {
             alias: "pt",
             type: "boolean",
@@ -232,14 +238,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
         flashAttention, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, printTimings
+        debug, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -254,7 +260,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -276,13 +282,16 @@ async function RunCompletion({
         logLevel: llamaLogLevel
     });
     const logBatchSize = batchSize != null;
+    const useMmap = !noMmap && llama.supportsMmap;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
-        flashAttention
+        flashAttention,
+        useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
+            useMmap,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -320,6 +329,7 @@ async function RunCompletion({
             ? {fitContext: {contextSize}}
             : undefined,
         defaultContextFlashAttention: flashAttention,
+        useMmap,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -352,6 +362,7 @@ async function RunCompletion({
         return await llama.loadModel({
             modelPath: resolvedDraftModelPath,
             defaultContextFlashAttention: flashAttention,
+            useMmap,
             onLoadProgress(loadProgress: number) {
                 progressUpdater.setProgress(loadProgress);
             },
@@ -429,6 +440,7 @@ async function RunCompletion({
     const padTitle = await printCommonInfoLines({
         context,
         draftContext,
+        useMmap,
         minTitleLength: "Complete".length + 1,
         logBatchSize,
         tokenMeterEnabled: meter
