
Commit b97fd3e

Merge branch 'xsn/mimi_dec' into xsn/csm_tts

2 parents 4012054 + 61d8ad6

File tree

10 files changed: +1182 -40 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ examples/server/*.gz.hpp
 !examples/*/*/*.kts
 !examples/sycl/*.bat
 !examples/sycl/*.sh
+/*.wav
 
 # Server Web UI temporary files
 node_modules

common/common.cpp

Lines changed: 28 additions & 0 deletions
@@ -2055,3 +2055,31 @@ common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
     }
     return out;
 }
+
+//
+// Audio utils
+//
+
+bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+    std::ofstream file(fname, std::ios::binary);
+    if (!file) {
+        LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
+        return false;
+    }
+
+    wav_header header;
+    header.sample_rate = sample_rate;
+    header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
+    header.block_align = header.num_channels * (header.bits_per_sample / 8);
+    header.data_size = data.size() * (header.bits_per_sample / 8);
+    header.chunk_size = 36 + header.data_size;
+
+    file.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+    for (const auto & sample : data) {
+        int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
+        file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
+    }
+
+    return file.good();
+}
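For orientation, `save_wav16` clamps each float sample to [-1, 1], scales it to 16-bit PCM, and writes it after the RIFF/WAVE header defined in `common.h`. A minimal caller might look like the sketch below; this is hypothetical illustration code, not part of the commit, and it assumes the file is compiled and linked against `common` (the 24 kHz rate and 440 Hz tone are arbitrary choices for the example).

```cpp
// Hypothetical usage sketch (not from this commit): write one second of a
// 440 Hz sine tone to "tone.wav" using the new save_wav16 helper.
#include "common.h"

#include <cmath>
#include <vector>

int main() {
    const int   sample_rate = 24000;          // assumed rate; Mimi's audio is nominally 24 kHz
    const float freq        = 440.0f;
    std::vector<float> samples(sample_rate);  // one second of mono audio
    for (size_t i = 0; i < samples.size(); i++) {
        samples[i] = 0.5f * std::sin(2.0f * 3.14159265f * freq * i / sample_rate);
    }
    // returns false if the file could not be opened or fully written
    return save_wav16("tone.wav", samples, sample_rate) ? 0 : 1;
}
```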

common/common.h

Lines changed: 22 additions & 0 deletions
@@ -683,3 +683,25 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// Audio utils
+//
+
+struct wav_header {
+    char riff[4] = {'R', 'I', 'F', 'F'};
+    uint32_t chunk_size;
+    char wave[4] = {'W', 'A', 'V', 'E'};
+    char fmt[4] = {'f', 'm', 't', ' '};
+    uint32_t fmt_chunk_size = 16;
+    uint16_t audio_format = 1; // PCM
+    uint16_t num_channels = 1; // Mono
+    uint32_t sample_rate;
+    uint32_t byte_rate;
+    uint16_t block_align;
+    uint16_t bits_per_sample = 16;
+    char data[4] = {'d', 'a', 't', 'a'};
+    uint32_t data_size;
+};
+
+bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate);
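As a sanity check on the layout above: the fields are naturally aligned, so on typical platforms `sizeof(wav_header)` is 44 bytes with no padding, and `chunk_size = 36 + data_size` is just that header size minus the 8-byte `riff`/`chunk_size` preamble. Below is a small worked example with hypothetical numbers (one second of 24 kHz mono 16-bit PCM), following the same formulas `save_wav16` uses; it is not code from the commit.

```cpp
// Worked example (illustrative, not from the commit): derived header fields
// for one second of 24 kHz, mono, 16-bit PCM.
#include "common.h"

#include <cstdio>

int main() {
    wav_header h;
    h.sample_rate = 24000;
    h.byte_rate   = h.sample_rate * h.num_channels * (h.bits_per_sample / 8); // 24000 * 1 * 2 = 48000
    h.block_align = h.num_channels * (h.bits_per_sample / 8);                 // 1 * 2 = 2
    h.data_size   = 24000 * (h.bits_per_sample / 8);                          // 48000 bytes of samples
    h.chunk_size  = 36 + h.data_size;                                         // 48036
    std::printf("sizeof(wav_header) = %zu\n", sizeof(h));                     // expected: 44
    return 0;
}
```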

examples/tts/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -9,3 +9,10 @@ add_executable(${TARGET} tts-csm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-mimi)
+add_executable(${TARGET} mimi.cpp mimi-model.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
+target_compile_features(${TARGET} PRIVATE cxx_std_20)
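The `cxx_std_20` comment above refers to designated initializers, which only became part of standard C++ in C++20 (they were a compiler extension before that). A minimal illustration with a made-up struct follows; it is not code taken from `mimi.cpp`.

```cpp
// C++20 designated initializers on a hypothetical options struct; fields that
// are not named keep their default member initializers.
struct decode_opts {
    int  n_threads = 4;
    bool write_wav = true;
    bool verbose   = false;
};

int main() {
    decode_opts opts = { .n_threads = 8, .verbose = true }; // write_wav stays true
    return (opts.n_threads == 8 && opts.write_wav && opts.verbose) ? 0 : 1;
}
```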

examples/tts/README-mimi.md

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# llama.cpp/example/mimi

This demonstrates running [Kyutai's Mimi](https://huggingface.co/kyutai/mimi) model via GGML.

## Quickstart

Convert the model to GGUF (no need to download it manually; the script automatically downloads the `safetensors` file):

```sh
python examples/tts/convert_mimi_to_gguf.py

# output file: kyutai-mimi.gguf

# optionally, use q8_0 quantization for faster speed
python examples/tts/convert_mimi_to_gguf.py --outtype q8_0
```

Then compile and run it:

```sh
cmake --build build -j --target llama-mimi

./build/bin/llama-mimi kyutai-mimi.gguf codes.txt

# output: output.wav

# alternatively, use "dummy1" to get a "wah hello there" sample output file
./build/bin/llama-mimi kyutai-mimi.gguf dummy1
```

Example of a code file (one code per line):

```
1263
1597
1596
1477
1540
1720
1433
118
1066
1968
1096
232
418
566
1653
2010
```
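Parsing such a file amounts to reading one integer per line. A hypothetical stand-alone reader is sketched below only to illustrate the format; the actual loader lives in `mimi.cpp` and may differ.

```cpp
// Hypothetical sketch: read a Mimi code file with one integer code per line.
#include <fstream>
#include <iostream>
#include <vector>

int main(int argc, char ** argv) {
    std::ifstream fin(argc > 1 ? argv[1] : "codes.txt");
    std::vector<int> codes;
    int code;
    while (fin >> code) {
        codes.push_back(code);
    }
    std::cout << "read " << codes.size() << " codes\n";
    return 0;
}
```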
examples/tts/convert_mimi_to_gguf.py

Lines changed: 191 additions & 0 deletions

@@ -0,0 +1,191 @@
import gguf
import argparse
import logging
import torch
from typing import Union
from pathlib import Path
from torch import Tensor
from transformers import MimiModel, PreTrainedModel

logger = logging.getLogger("mimi")


class MimiModelConverter:
    mimi_model: PreTrainedModel
    gguf_writer: gguf.GGUFWriter
    fname_out: Path
    ftype: gguf.LlamaFileType

    def __init__(self,
                 pretrained_model_name_or_path: Union[Path, str],
                 fname_out: Path,
                 ftype: gguf.LlamaFileType,
                 is_big_endian: bool,):
        self.mimi_model = MimiModel.from_pretrained(pretrained_model_name_or_path)
        self.fname_out = fname_out
        self.ftype = ftype
        endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.gguf_writer = gguf.GGUFWriter(
            path=None,
            arch="if you see this, you are using the wrong file",
            endianess=endianess)

        assert self.mimi_model.config.architectures[0] == "MimiModel"

        # load tensors
        for name, data_torch in self.mimi_model.state_dict().items():
            # convert any unsupported data types to float32
            old_dtype = data_torch.dtype
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            self.add_tensor(name, data_torch, old_dtype)

    def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype):
        is_1d = len(data_torch.shape) == 1
        is_bias = ".bias" in name
        can_quantize = not is_1d and not is_bias
        data_qtype = gguf.GGMLQuantizationType.F32

        n_head = self.mimi_model.config.num_attention_heads
        n_kv_head = self.mimi_model.config.num_key_value_heads
        if name.endswith(("q_proj.weight", "q_proj.bias")):
            data_torch = self.undo_permute(data_torch, n_head, n_head)
        if name.endswith(("k_proj.weight", "k_proj.bias")):
            data_torch = self.undo_permute(data_torch, n_head, n_kv_head)

        # process codebook
        if ".codebook.initialized" in name:
            # "initialized" tensor
            state_dict = self.mimi_model.state_dict()
            embed_sum = state_dict[name.replace(".initialized", ".embed_sum")]
            cluster_usage = state_dict[name.replace(".initialized", ".cluster_usage")]
            # see modeling_mimi.py --> MimiEuclideanCodebook
            data_torch = embed_sum / cluster_usage.clamp(min=self.mimi_model.config.norm_eps)[:, None]
            name = name.replace(".initialized", "")

        # ignore processed tensors
        if ".cluster_usage" in name or ".embed_sum" in name:
            return

        # transpose some tensors
        if ".conv.bias" in name:
            data_torch = data_torch.view((1, data_torch.shape[0]))
            data_torch = data_torch.transpose(0, 1)

        # change view 3d to 2d
        if "quantizer" in name and "_proj." in name:
            assert data_torch.shape[2] == 1
            data_torch = data_torch.view((data_torch.shape[0], data_torch.shape[1]))

        # shorten name, otherwise it will be too long for ggml to read
        name = name.replace("_residual_vector_quantizer", "_rvq")

        if can_quantize:
            if self.ftype == gguf.LlamaFileType.ALL_F32:
                data_qtype = gguf.GGMLQuantizationType.F32
            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                data_qtype = gguf.GGMLQuantizationType.F16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                data_qtype = gguf.GGMLQuantizationType.BF16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                data_qtype = gguf.GGMLQuantizationType.Q8_0
            else:
                raise ValueError(f"Unsupported file type: {self.ftype}")

        # Conv kernels are always F16
        if ".conv.weight" in name:
            data_qtype = gguf.GGMLQuantizationType.F16

        data = data_torch.numpy()

        try:
            data = gguf.quants.quantize(data, data_qtype)
        except Exception as e:
            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)

        # reverse shape to make it similar to the internal ggml dimension order
        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
        logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)

    def write(self):
        self.gguf_writer.write_header_to_file(path=self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file(progress=True)
        self.gguf_writer.close()

    @staticmethod
    def undo_permute(weights: Tensor, n_head: int, n_head_kv: int):
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert Mimi safetensors model to GGUF",)
    parser.add_argument(
        "--outfile", type=Path, default="kyutai-mimi.gguf",
        help="path to write to",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
        help="output format",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory or model ID containing model file (if model ID is specified, download from Hugging Face hub)",
        nargs="?",
        default="kyutai/mimi",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )

    args = parser.parse_args()
    if args.model is None:
        parser.error("the following arguments are required: model")
    return args


def main() -> None:
    args = parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    dir_model = args.model

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
    }

    logger.info(f"Loading model: {dir_model}")

    with torch.inference_mode():
        converter = MimiModelConverter(
            pretrained_model_name_or_path=dir_model,
            fname_out=args.outfile,
            ftype=ftype_map[args.outtype],
            is_big_endian=args.bigendian,
        )
        converter.write()


if __name__ == '__main__':
    main()