-
Notifications
You must be signed in to change notification settings - Fork 501
Expand file tree
/
Copy pathexport_split.py
More file actions
79 lines (69 loc) · 2.9 KB
/
export_split.py
File metadata and controls
79 lines (69 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import argparse
import pickle
import sys
from gguf.constants import GGMLQuantizationType
from gguf.gguf_writer import GGUFWriter
import torch
from pathlib import Path
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
import struct
import numpy as np
import re
def load_activation_weights(models_base: Path) -> list:
    """Load per-layer activation tensors from *models_base*, ordered by layer index.

    The directory is expected to contain files named ``activation_{i}.pt``
    for i in 0 .. n_layers - 1 (TODO: a specification file could make the
    selection explicit instead of relying on the naming convention).

    Returns a list of loaded tensors sorted by the numeric layer index.
    """
    # Only the top-level files of the directory are considered (no recursion).
    *_, files = next(os.walk(models_base))
    # fullmatch + escaped dot: reject names like "activation_3.pt.bak" or
    # "activation_3xpt" that the old unanchored pattern (r"activation_\d+.pt")
    # accepted. The layer index comes straight from the capture group.
    pattern = re.compile(r"activation_(\d+)\.pt")
    indexed = sorted(
        (int(m.group(1)), f)
        for f in files
        if (m := pattern.fullmatch(f)) is not None
    )
    # map_location="cpu": activation dumps may have been saved from CUDA
    # tensors; force them onto the CPU so export works on CPU-only hosts.
    # NOTE(review): torch.load unpickles — only run on trusted activation files.
    return [torch.load(models_base / f, map_location="cpu") for _, f in indexed]
def append_gpu_idx(gguf: GGUFWriter, i_layer: int, activation, select_count) -> None:
    """Append the GPU-split tensors for layer *i_layer* to the writer.

    Two int32 tensors are added per layer:
      - ``blk.{i}.gpu_idx``:    a 0/1 mask over neurons, 1 for the top
        ``select_count`` activations chosen for the GPU.
      - ``blk.{i}.gpu_bucket``: the sorted indices of those chosen neurons.
    """
    _, top = torch.topk(activation, k=int(select_count))

    # Dense 0/1 mask with ones at the selected neuron positions.
    mask = torch.zeros_like(activation)
    mask[top] = 1
    _emit_i32_tensor(gguf, f"blk.{i_layer}.gpu_idx", mask.numpy().astype(np.int32))

    # Sorted index list of the same selection.
    bucket = np.sort(top.numpy().astype(np.int32))
    _emit_i32_tensor(gguf, f"blk.{i_layer}.gpu_bucket", bucket)


def _emit_i32_tensor(writer: GGUFWriter, key: str, data: np.ndarray) -> None:
    """Log and add one int32 tensor to *writer* (shape reversed to raw order)."""
    print(
        f"{key} => {key} {data.shape} {data.dtype} {data.nbytes/1024/1024} MiB"
    )
    writer.add_tensor(
        name=key,
        tensor=data,
        raw_shape=data.shape[::-1],
        raw_dtype=GGMLQuantizationType.I32,
    )
def export_split(activations_path: str, output_path: str, solved_list: list[int], vram_capacity: int):
    """Write a PowerInfer GPU-index file from per-layer activation statistics.

    Parameters:
      activations_path: directory holding ``activation_{i}.pt`` files.
      output_path:      destination GGUF-format file.
      solved_list:      per-layer neuron counts selected for the GPU.
      vram_capacity:    VRAM budget recorded in the file's KV metadata.
    """
    # Each loaded activation tensor acts as the predictor for its layer.
    activations = load_activation_weights(Path(activations_path))
    writer = GGUFWriter(output_path, "generic.gpu_index")

    for layer_idx, (act, n_selected) in enumerate(zip(activations, solved_list)):
        append_gpu_idx(writer, layer_idx, act, n_selected)

    # KV metadata.
    writer.add_block_count(len(activations))
    # TODO: better to save the actual capacity that split neurons require
    writer.add_uint64(gguf.Keys.Split.VRAM_CAPACITY, vram_capacity)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    # Post-process: stamp a distinct magic + version over the first 8 bytes
    # so the result is distinguishable from an original GGUF file.
    with open(output_path, "r+b") as fout:
        fout.write(struct.pack("<I", int.from_bytes(b"PWRI", "little")))
        fout.write(struct.pack("<I", 3))

    print(f"exported GPU index to {output_path}")