Commit 47475ac

fix(Vulkan): include integrated GPU memory (#516)
* fix(Vulkan): include integrated GPU memory - adapt to a change in `llama.cpp`
* fix(Vulkan): deduplicate the same device coming from different drivers
* fix: adapt Llama chat wrappers to breaking `llama.cpp` changes
* fix: internal log level
* docs(Vulkan): recommend installing LLVM on Windows
1 parent 02805ee commit 47475ac

File tree

14 files changed: +192 -134 lines


docs/guide/Vulkan.md

Lines changed: 5 additions & 0 deletions
@@ -65,6 +65,11 @@ If you see `Vulkan used VRAM` in the output, it means that Vulkan support is wor
     reg add "HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem" /v "LongPathsEnabled" /t REG_DWORD /d "1" /f
     ```
     :::
+* :::details Windows only: LLVM (optional, recommended if you have build issues)
+  There are a few methods to install LLVM:
+  * **As part of Microsoft Visual C++ Build Tools (Recommended):** the dependencies for Windows listed under [Downloading a Release](./building-from-source.md#downloading-a-release) will also install LLVM.
+  * **Independently:** visit the [latest LLVM release page](https://github.com/llvm/llvm-project/releases/latest) and download the installer for your Windows architecture.
+  :::
 
 ### Building From Source
 When you use the [`getLlama`](../api/functions/getLlama) method, if there's no binary that matches the provided options, it'll automatically build `llama.cpp` from source.

docs/guide/embedding.md

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
 </script>
 
 #### Embedded databases {#databases-embedded}
-* **[LanceDB](https://lancedb.com/)** ([GitHub](https://github.com/lancedb/lancedb) | [npm](https://www.npmjs.com/package/@lancedb/lancedb) | [Quick start](https://lancedb.github.io/lancedb/basic/#__tabbed_1_2)) - Serverless vector database you can embed inside your application. No server required.
+* **[LanceDB](https://lancedb.com/)** ([GitHub](https://github.com/lancedb/lancedb) | [npm](https://www.npmjs.com/package/@lancedb/lancedb) | [Quick start](https://www.npmjs.com/package/@lancedb/lancedb#usage)) - Serverless vector database you can embed inside your application. No server required.
   <br/><DataBadge title="Written in" content="Rust"/><DataBadge title="License" content="Apache-2.0"/>
 
 * **Vectra** ([GitHub](https://github.com/Stevenic/vectra) | [npm](https://www.npmjs.com/package/vectra)) - local vector database using local files

llama/addon/globals/getGpuInfo.cpp

Lines changed: 4 additions & 3 deletions
@@ -27,7 +27,8 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
 
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         device = ggml_backend_dev_get(i);
-        if (ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+        auto deviceType = ggml_backend_dev_type(device);
+        if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) {
             deviceTotal = 0;
             deviceFree = 0;
             ggml_backend_dev_memory(device, &deviceFree, &deviceTotal);
@@ -76,8 +77,8 @@ Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info) {
 
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t device = ggml_backend_dev_get(i);
-        if (ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-
+        auto deviceType = ggml_backend_dev_type(device);
+        if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) {
            deviceNames.push_back(std::string(ggml_backend_dev_description(device)));
        }
    }
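
Both bindings now count devices that `ggml` classifies as integrated GPUs (`GGML_BACKEND_DEVICE_TYPE_IGPU`) in addition to discrete ones. A minimal sketch of how that surfaces to users, assuming the high-level `getLlama` API referenced in this repo's docs; the `getVramState()` method name is an assumption here, not something this diff confirms:

```ts
import {getLlama} from "node-llama-cpp";

// A sketch: `getLlama` is referenced in this repo's docs; `getVramState()`
// is an assumption and may differ from the actual API surface.
const llama = await getLlama({gpu: "vulkan"});

// After this fix, these totals should also count integrated GPU (IGPU)
// devices reported by ggml, not only discrete VRAM.
const vram = await llama.getVramState();
console.log("total:", vram.total, "used:", vram.used);
```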

llama/gpuInfo/vulkan-gpu-info.cpp

Lines changed: 95 additions & 2 deletions
@@ -1,16 +1,109 @@
 #include <stddef.h>
+#include <map>
 #include <vector>
 
 #include <vulkan/vulkan.hpp>
 
+constexpr std::uint32_t VK_VENDOR_ID_AMD = 0x1002;
+constexpr std::uint32_t VK_VENDOR_ID_APPLE = 0x106b;
+constexpr std::uint32_t VK_VENDOR_ID_INTEL = 0x8086;
+constexpr std::uint32_t VK_VENDOR_ID_NVIDIA = 0x10de;
+
 typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);
 
-static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector<std::string> * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback, bool * checkSupported) {
+static vk::Instance vulkanInstance() {
     vk::ApplicationInfo appInfo("node-llama-cpp GPU info", 1, "llama.cpp", 1, VK_API_VERSION_1_2);
     vk::InstanceCreateInfo createInfo(vk::InstanceCreateFlags(), &appInfo, {}, {});
-    vk::Instance instance = vk::createInstance(createInfo);
+    return vk::createInstance(createInfo);
+}
 
+static std::vector<vk::PhysicalDevice> dedupedDevices() {
+    vk::Instance instance = vulkanInstance();
     auto physicalDevices = instance.enumeratePhysicalDevices();
+    std::vector<vk::PhysicalDevice> dedupedDevices;
+    dedupedDevices.reserve(physicalDevices.size());
+
+    // adapted from `ggml_vk_instance_init` in `ggml-vulkan.cpp`
+    for (const auto& device : physicalDevices) {
+        vk::PhysicalDeviceProperties2 newProps;
+        vk::PhysicalDeviceDriverProperties newDriver;
+        vk::PhysicalDeviceIDProperties newId;
+        newProps.pNext = &newDriver;
+        newDriver.pNext = &newId;
+        device.getProperties2(&newProps);
+
+        auto oldDevice = std::find_if(
+            dedupedDevices.begin(),
+            dedupedDevices.end(),
+            [&newId](const vk::PhysicalDevice& oldDevice) {
+                vk::PhysicalDeviceProperties2 oldProps;
+                vk::PhysicalDeviceDriverProperties oldDriver;
+                vk::PhysicalDeviceIDProperties oldId;
+                oldProps.pNext = &oldDriver;
+                oldDriver.pNext = &oldId;
+                oldDevice.getProperties2(&oldProps);
+
+                bool equals = std::equal(std::begin(oldId.deviceUUID), std::end(oldId.deviceUUID), std::begin(newId.deviceUUID));
+                equals = equals || (
+                    oldId.deviceLUIDValid && newId.deviceLUIDValid &&
+                    std::equal(std::begin(oldId.deviceLUID), std::end(oldId.deviceLUID), std::begin(newId.deviceLUID))
+                );
+
+                return equals;
+            }
+        );
+
+        if (oldDevice == dedupedDevices.end()) {
+            dedupedDevices.push_back(device);
+            continue;
+        }
+
+        vk::PhysicalDeviceProperties2 oldProps;
+        vk::PhysicalDeviceDriverProperties oldDriver;
+        oldProps.pNext = &oldDriver;
+        oldDevice->getProperties2(&oldProps);
+
+        std::map<vk::DriverId, int> driverPriorities {};
+        int oldPriority = 1000;
+        int newPriority = 1000;
+
+        switch (oldProps.properties.vendorID) {
+            case VK_VENDOR_ID_AMD:
+                driverPriorities[vk::DriverId::eMesaRadv] = 1;
+                driverPriorities[vk::DriverId::eAmdOpenSource] = 2;
+                driverPriorities[vk::DriverId::eAmdProprietary] = 3;
+                break;
+            case VK_VENDOR_ID_INTEL:
+                driverPriorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                driverPriorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                break;
+            case VK_VENDOR_ID_NVIDIA:
+                driverPriorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                driverPriorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                break;
+        }
+        driverPriorities[vk::DriverId::eMesaDozen] = 4;
+
+        if (driverPriorities.count(oldDriver.driverID)) {
+            oldPriority = driverPriorities[oldDriver.driverID];
+        }
+        if (driverPriorities.count(newDriver.driverID)) {
+            newPriority = driverPriorities[newDriver.driverID];
+        }
+
+        if (newPriority < oldPriority) {
+            dedupedDevices.erase(std::remove(dedupedDevices.begin(), dedupedDevices.end(), *oldDevice), dedupedDevices.end());
+            dedupedDevices.push_back(device);
+        }
+    }
+
+    return dedupedDevices;
+}
+
+static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector<std::string> * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback, bool * checkSupported) {
+    auto physicalDevices = dedupedDevices();
 
     size_t usedMem = 0;
     size_t totalMem = 0;
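
The dedupe pass keeps a single entry per physical device: candidates are matched by `deviceUUID` (or by `deviceLUID` when it is valid on both), and when two drivers expose the same device, the one with the lower priority number wins, with unknown drivers defaulting to 1000. The same selection rule, re-expressed in TypeScript with hypothetical `DeviceInfo` records standing in for the Vulkan handles:

```ts
// Hypothetical shape standing in for vk::PhysicalDevice plus its ID/driver properties.
interface DeviceInfo {
    uuid: string;
    driverId: string;
}

// Lower number = preferred driver; unknown drivers get a large default,
// mirroring the `oldPriority = 1000` fallback in the C++ code above.
const DEFAULT_PRIORITY = 1000;

function dedupeDevices(devices: DeviceInfo[], priorities: Map<string, number>): DeviceInfo[] {
    const deduped: DeviceInfo[] = [];
    for (const device of devices) {
        const existingIndex = deduped.findIndex((d) => d.uuid === device.uuid);
        if (existingIndex < 0) {
            deduped.push(device);
            continue;
        }
        const oldPriority = priorities.get(deduped[existingIndex]!.driverId) ?? DEFAULT_PRIORITY;
        const newPriority = priorities.get(device.driverId) ?? DEFAULT_PRIORITY;

        // Replace the kept entry only when the new driver is strictly preferred
        if (newPriority < oldPriority)
            deduped[existingIndex] = device;
    }
    return deduped;
}

// Example: prefer RADV over the AMD proprietary driver for the same GPU
const result = dedupeDevices([
    {uuid: "gpu-0", driverId: "amd-proprietary"},
    {uuid: "gpu-0", driverId: "mesa-radv"}
], new Map([["mesa-radv", 1], ["amd-proprietary", 3]]));
console.log(result); // [{uuid: "gpu-0", driverId: "mesa-radv"}]
```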

src/bindings/Llama.ts

Lines changed: 2 additions & 0 deletions
@@ -684,6 +684,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil
         return LlamaLogLevel.info;
     else if (level === LlamaLogLevel.warn && message.startsWith("load: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list"))
         return LlamaLogLevel.info;
+    else if (level === LlamaLogLevel.warn && message.startsWith("llama_init_from_model: model default pooling_type is [0], but [-1] was specified"))
+        return LlamaLogLevel.info;
     else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU"))
         return LlamaLogLevel.info;
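
`getTransformedLogLevel` is effectively a list of prefix-match rules that demote known-noisy `llama.cpp` warnings to info-level logs; this commit adds one more rule for the default pooling-type message. A simplified sketch of the pattern (illustrative only, not the actual implementation):

```ts
// Illustrative only: prefix-match rules that demote known-noisy
// llama.cpp warnings to info-level logs.
const demotedWarningPrefixes = [
    "load: special_eog_ids contains both '<|return|>' and '<|call|>' tokens",
    "llama_init_from_model: model default pooling_type is [0], but [-1] was specified"
];

type LogLevel = "error" | "warn" | "info";

function transformLogLevel(level: LogLevel, message: string): LogLevel {
    if (level === "warn" && demotedWarningPrefixes.some((prefix) => message.startsWith(prefix)))
        return "info";
    return level;
}

// The pooling-type warning now comes out as a plain info log
console.log(transformLogLevel("warn", "llama_init_from_model: model default pooling_type is [0], but [-1] was specified"));
```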

src/chatWrappers/FunctionaryChatWrapper.ts

Lines changed: 14 additions & 14 deletions
@@ -39,13 +39,13 @@ export class FunctionaryChatWrapper extends ChatWrapper {
             prefix: LlamaText([
                 new SpecialTokensText("<|start_header_id|>tool<|end_header_id|>\n\n")
             ]),
-            suffix: LlamaText(new SpecialToken("EOT"))
+            suffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
         },
         parallelism: {
             call: {
                 sectionPrefix: "",
                 betweenCalls: "",
-                sectionSuffix: LlamaText(new SpecialToken("EOT"))
+                sectionSuffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
             },
             result: {
                 sectionPrefix: "",
@@ -72,13 +72,13 @@ export class FunctionaryChatWrapper extends ChatWrapper {
                 "{{functionName}}",
                 new SpecialTokensText("\n")
             ]),
-            suffix: LlamaText(new SpecialToken("EOT"))
+            suffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
         },
         parallelism: {
             call: {
                 sectionPrefix: "",
                 betweenCalls: "",
-                sectionSuffix: LlamaText(new SpecialToken("EOT"))
+                sectionSuffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
             },
             result: {
                 sectionPrefix: "",
@@ -155,13 +155,13 @@ export class FunctionaryChatWrapper extends ChatWrapper {
             return LlamaText([
                 new SpecialTokensText("<|start_header_id|>system<|end_header_id|>\n\n"),
                 LlamaText.fromJSON(item.text),
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ]);
         } else if (item.type === "user") {
             return LlamaText([
                 new SpecialTokensText("<|start_header_id|>user<|end_header_id|>\n\n"),
                 item.text,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ]);
         } else if (item.type === "model") {
             if (isLastItem && item.response.length === 0)
@@ -178,7 +178,7 @@ export class FunctionaryChatWrapper extends ChatWrapper {
                     return;
 
                 res.push(LlamaText(pendingFunctionCalls));
-                res.push(LlamaText(new SpecialToken("EOT")));
+                res.push(LlamaText(new SpecialTokensText("<|eot_id|>")));
                 res.push(LlamaText(pendingFunctionResults));
 
                 pendingFunctionResults.length = 0;
@@ -206,7 +206,7 @@
                     response,
                     (!isLastResponse || isLastItem)
                         ? LlamaText([])
-                        : new SpecialToken("EOT")
+                        : new SpecialTokensText("<|eot_id|>")
                 ])
             ])
         );
@@ -232,7 +232,7 @@
                     response.result === undefined
                         ? "" // "void"
                        : jsonDumps(response.result),
-                    new SpecialToken("EOT")
+                    new SpecialTokensText("<|eot_id|>")
                 ])
             );
         } else
@@ -320,13 +320,13 @@
             return LlamaText([
                 new SpecialTokensText("<|start_header_id|>system<|end_header_id|>\n\n"),
                 LlamaText.fromJSON(item.text),
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ]);
         } else if (item.type === "user") {
             return LlamaText([
                 new SpecialTokensText("<|start_header_id|>user<|end_header_id|>\n\n"),
                 item.text,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ]);
         } else if (item.type === "model") {
             if (isLastItem && item.response.length === 0)
@@ -343,7 +343,7 @@
                     return;
 
                 res.push(LlamaText(pendingFunctionCalls));
-                res.push(LlamaText(new SpecialToken("EOT")));
+                res.push(LlamaText(new SpecialTokensText("<|eot_id|>")));
                 res.push(LlamaText(pendingFunctionResults));
 
                 pendingFunctionResults.length = 0;
@@ -365,7 +365,7 @@
                     response,
                     (isLastItem && isLastResponse)
                         ? LlamaText([])
-                        : new SpecialToken("EOT")
+                        : new SpecialTokensText("<|eot_id|>")
                 ])
             );
         } else if (isChatModelResponseFunctionCall(response)) {
@@ -392,7 +392,7 @@
                     response.result === undefined
                         ? "" // "void"
                        : jsonDumps(response.result),
-                    new SpecialToken("EOT")
+                    new SpecialTokensText("<|eot_id|>")
                 ])
             );
         } else
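
The same substitution repeats through this file and the two Llama 3 wrappers below: the abstract `new SpecialToken("EOT")` marker becomes the literal `<|eot_id|>` tag wrapped in `SpecialTokensText`, presumably so the wrappers no longer rely on `llama.cpp` resolving a model-specific EOT token. A minimal sketch of the resulting pattern (the import path is an assumption):

```ts
import {LlamaText, SpecialTokensText} from "node-llama-cpp";

// One user turn in Llama 3-style chat formatting, ending with the
// literal <|eot_id|> tag tokenized as a special token rather than an
// abstract EOT marker resolved by the tokenizer.
const userTurn = LlamaText([
    new SpecialTokensText("<|start_header_id|>user<|end_header_id|>\n\n"),
    "Hi there!",
    new SpecialTokensText("<|eot_id|>")
]);
```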

src/chatWrappers/Llama3ChatWrapper.ts

Lines changed: 7 additions & 7 deletions
@@ -34,13 +34,13 @@ export class Llama3ChatWrapper extends ChatWrapper {
         },
         result: {
             prefix: LlamaText(new SpecialTokensText("<|start_header_id|>function_call_result<|end_header_id|>\n\n")),
-            suffix: LlamaText(new SpecialToken("EOT"))
+            suffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
         },
         parallelism: {
             call: {
                 sectionPrefix: "",
                 betweenCalls: "\n",
-                sectionSuffix: LlamaText(new SpecialToken("EOT"))
+                sectionSuffix: LlamaText(new SpecialTokensText("<|eot_id|>"))
             },
             result: {
                 sectionPrefix: "",
@@ -62,11 +62,11 @@
         },
         result: {
             prefix: LlamaText([
-                LlamaText(new SpecialToken("EOT")),
+                LlamaText(new SpecialTokensText("<|eot_id|>")),
                 new SpecialTokensText("<|start_header_id|>function_call_result<|end_header_id|>\n\n")
             ]),
             suffix: LlamaText([
-                new SpecialToken("EOT"),
+                new SpecialTokensText("<|eot_id|>"),
                 new SpecialTokensText("<|start_header_id|>assistant<|end_header_id|>\n\n")
             ])
         }
@@ -147,7 +147,7 @@
             LlamaText([
                 new SpecialTokensText("<|start_header_id|>system<|end_header_id|>\n\n"),
                 item.system,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ])
         );
     }
@@ -157,7 +157,7 @@
             LlamaText([
                 new SpecialTokensText("<|start_header_id|>user<|end_header_id|>\n\n"),
                 item.user,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ])
         );
     }
@@ -169,7 +169,7 @@
                 item.model,
                 isLastItem
                     ? LlamaText([])
-                    : new SpecialToken("EOT")
+                    : new SpecialTokensText("<|eot_id|>")
             ])
         );
     }

src/chatWrappers/Llama3_1ChatWrapper.ts

Lines changed: 4 additions & 4 deletions
@@ -29,7 +29,7 @@ export class Llama3_1ChatWrapper extends ChatWrapper {
         },
         result: {
             prefix: LlamaText(new SpecialTokensText("\n<|start_header_id|>ipython<|end_header_id|>\n\n")),
-            suffix: LlamaText(new SpecialToken("EOT"), new SpecialTokensText("<|start_header_id|>assistant<|end_header_id|>\n\n"))
+            suffix: LlamaText(new SpecialTokensText("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"))
         }
     }
 };
@@ -189,7 +189,7 @@
             LlamaText([
                 new SpecialTokensText("<|start_header_id|>system<|end_header_id|>\n\n"),
                 item.system,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ])
         );
     }
@@ -199,7 +199,7 @@
             LlamaText([
                 new SpecialTokensText("<|start_header_id|>user<|end_header_id|>\n\n"),
                 item.user,
-                new SpecialToken("EOT")
+                new SpecialTokensText("<|eot_id|>")
             ])
         );
     }
@@ -211,7 +211,7 @@
                 item.model,
                 isLastItem
                     ? LlamaText([])
-                    : new SpecialToken("EOT")
+                    : new SpecialTokensText("<|eot_id|>")
             ])
         );
     }
