-
Notifications
You must be signed in to change notification settings - Fork 501
Expand file tree
/
Copy pathexport_split.py
More file actions
79 lines (69 loc) · 2.9 KB
/
export_split.py
File metadata and controls
79 lines (69 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import argparse
import pickle
import sys
from gguf.constants import GGMLQuantizationType
from gguf.gguf_writer import GGUFWriter
import torch
from pathlib import Path
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
import struct
import numpy as np
import re
def load_activation_weights(models_base: Path) -> list:
    """Load per-layer activation tensors from *models_base*, ordered by layer index.

    The directory is expected to contain files named ``activation_{i}.pt``
    for i in 0 .. n_layers - 1 (TODO: a specification file could make the
    selection explicit instead of relying on the naming convention).

    Returns a list of loaded tensors sorted by the numeric layer index.
    """
    # Only the top-level files of the directory are considered (no recursion).
    *_, files = next(os.walk(models_base))
    # fullmatch + escaped dot: reject names like "activation_3.pt.bak" or
    # "activation_3xpt" that the old unanchored pattern (r"activation_\d+.pt")
    # accepted. The layer index comes straight from the capture group.
    pattern = re.compile(r"activation_(\d+)\.pt")
    indexed = sorted(
        (int(m.group(1)), f)
        for f in files
        if (m := pattern.fullmatch(f)) is not None
    )
    # map_location="cpu": activation dumps may have been saved from CUDA
    # tensors; force them onto the CPU so export works on CPU-only hosts.
    # NOTE(review): torch.load unpickles — only run on trusted activation files.
    return [torch.load(models_base / f, map_location="cpu") for _, f in indexed]
def append_gpu_idx(gguf: GGUFWriter, i_layer: int, activation, select_count) -> None:
    """Append the GPU-split tensors for layer *i_layer* to the writer.

    Two int32 tensors are added per layer:
      - ``blk.{i}.gpu_idx``:    a 0/1 mask over neurons, 1 for the top
        ``select_count`` activations chosen for the GPU.
      - ``blk.{i}.gpu_bucket``: the sorted indices of those chosen neurons.
    """
    _, top = torch.topk(activation, k=int(select_count))

    # Dense 0/1 mask with ones at the selected neuron positions.
    mask = torch.zeros_like(activation)
    mask[top] = 1
    _emit_i32_tensor(gguf, f"blk.{i_layer}.gpu_idx", mask.numpy().astype(np.int32))

    # Sorted index list of the same selection.
    bucket = np.sort(top.numpy().astype(np.int32))
    _emit_i32_tensor(gguf, f"blk.{i_layer}.gpu_bucket", bucket)


def _emit_i32_tensor(writer: GGUFWriter, key: str, data: np.ndarray) -> None:
    """Log and add one int32 tensor to *writer* (shape reversed to raw order)."""
    print(
        f"{key} => {key} {data.shape} {data.dtype} {data.nbytes/1024/1024} MiB"
    )
    writer.add_tensor(
        name=key,
        tensor=data,
        raw_shape=data.shape[::-1],
        raw_dtype=GGMLQuantizationType.I32,
    )
def export_split(activations_path: str, output_path: str, solved_list: list[int], vram_capacity: int):
    """Write a PowerInfer GPU-index file from per-layer activation statistics.

    Parameters:
      activations_path: directory holding ``activation_{i}.pt`` files.
      output_path:      destination GGUF-format file.
      solved_list:      per-layer neuron counts selected for the GPU.
      vram_capacity:    VRAM budget recorded in the file's KV metadata.
    """
    # Each loaded activation tensor acts as the predictor for its layer.
    activations = load_activation_weights(Path(activations_path))
    writer = GGUFWriter(output_path, "generic.gpu_index")

    for layer_idx, (act, n_selected) in enumerate(zip(activations, solved_list)):
        append_gpu_idx(writer, layer_idx, act, n_selected)

    # KV metadata.
    writer.add_block_count(len(activations))
    # TODO: better to save the actual capacity that split neurons require
    writer.add_uint64(gguf.Keys.Split.VRAM_CAPACITY, vram_capacity)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    # Post-process: stamp a distinct magic + version over the first 8 bytes
    # so the result is distinguishable from an original GGUF file.
    with open(output_path, "r+b") as fout:
        fout.write(struct.pack("<I", int.from_bytes(b"PWRI", "little")))
        fout.write(struct.pack("<I", 3))

    print(f"exported GPU index to {output_path}")