Skip to content

Commit 93346f9

Browse files
committed
feat(minor): improve memory usage estimation
1 parent 729de00 commit 93346f9

File tree

1 file changed

+24
-2
lines changed

1 file changed

+24
-2
lines changed

src/gguf/insights/GgufInsights.ts

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,7 @@ export class GgufInsights {
355355
gpu: GgufTensorInfo[]
356356
} {
357357
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
358+
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
358359

359360
if (gpuLayers === 0) {
360361
return {
@@ -369,7 +370,15 @@ export class GgufInsights {
369370
const gpuTensors: GgufTensorInfo[] = [];
370371
const cpuTensors: GgufTensorInfo[] = [];
371372

373+
let tokenEmbedLayer: GgufTensorInfo | undefined;
374+
let mainOutputLayer: GgufTensorInfo | undefined;
375+
372376
for (const singleTensorInfo of tensorInfo) {
377+
if (isMainOutputLayer(singleTensorInfo.name))
378+
mainOutputLayer = singleTensorInfo;
379+
else if (isTokenEmbedLayer(singleTensorInfo.name))
380+
tokenEmbedLayer = singleTensorInfo;
381+
373382
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
374383
// loaded with `model.dev_input`, which is always set to the CPU
375384
if (isInputLayer(singleTensorInfo.name)) {
@@ -391,8 +400,6 @@ export class GgufInsights {
391400
const {layerNumber} = parseTensorName(singleTensorInfo.name);
392401

393402
if (gpuLayers !== this.totalLayers) {
394-
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
395-
396403
if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
397404
if (layerNumber != null && layerNumber >= startGpuLayer)
398405
gpuTensors.push(singleTensorInfo);
@@ -409,6 +416,9 @@ export class GgufInsights {
409416
cpuTensors.push(singleTensorInfo);
410417
}
411418

419+
if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
420+
gpuTensors.push(tokenEmbedLayer);
421+
412422
return {
413423
cpu: cpuTensors,
414424
gpu: gpuTensors
@@ -626,3 +636,15 @@ function isOutputLayer(layerName: string) {
626636

627637
return false;
628638
}
639+
640+
/**
 * Checks whether a tensor name has `output` as its leading name segment.
 *
 * The name is split on `.` and only the first segment is compared, using an exact match.
 * For example, `output.weight` and a bare `output` match, while `output_norm.weight`
 * and `blk.0.output.weight` do not.
 *
 * @param layerName - The full tensor name to inspect.
 * @returns `true` when the first dot-separated segment is exactly `"output"`.
 */
function isMainOutputLayer(layerName: string): boolean {
    const [firstPart] = layerName.split(".");

    return firstPart === "output";
}
645+
646+
/**
 * Checks whether a tensor name has `token_embd` as its leading name segment.
 *
 * The name is split on `.` and only the first segment is compared, using an exact match.
 * For example, `token_embd.weight` and a bare `token_embd` match, while
 * `token_embd_norm.weight` and `blk.0.token_embd.weight` do not.
 *
 * @param layerName - The full tensor name to inspect.
 * @returns `true` when the first dot-separated segment is exactly `"token_embd"`.
 */
function isTokenEmbedLayer(layerName: string): boolean {
    const [firstPart] = layerName.split(".");

    return firstPart === "token_embd";
}

0 commit comments

Comments
 (0)