Skip to content

Commit 891273c

Browse files
committed
mimi : non-transposed input codes
1 parent a98f199 commit 891273c

File tree

3 files changed

+48
-45
lines changed

3 files changed

+48
-45
lines changed

examples/tts/mimi-model.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
*
2525
* Background:
2626
* - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc
27-
* - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times
27+
* - Audio codes must be in the order: N semantic codes followed by (N*31) acoustic codes
28+
* (In other words, input matrix has shape 32 cols x N rows)
2829
*
2930
* How does it work?
3031
* 1. Audio codes are passed to the RVQ (mimi_residual_vector_quantizer) to get the latent code
@@ -653,23 +654,22 @@ std::vector<float> mimi_model::decode_frame(const std::vector<int> & codes, int
653654
for (int i = 0; i < (int)pos_data.size(); i++) {
654655
pos_data[i] = i + n_past;
655656
}
656-
n_past += n_pos;
657657
if (verbose) {
658658
printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past);
659659
}
660+
n_past += n_pos;
660661
ctx->set_tensor_data("pos_dec", pos_data.data());
661662

662-
// code data (need to transpose it)
663-
// code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes]
664-
std::vector<int> codes_t(n_codes_per_embd * n_codes);
663+
// code data
664+
/*std::vector<int> codes_t(n_codes_per_embd * n_codes);
665665
for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
666666
for (int j = 0; j < n_codes_per_embd; j++) {
667667
int src_idx = i * n_codes_per_embd + j;
668668
int dst_idx = j * (n_codes / n_codes_per_embd) + i;
669669
codes_t[dst_idx] = codes[src_idx];
670670
}
671-
}
672-
ctx->set_tensor_data("inp_dec", codes_t.data());
671+
}*/
672+
ctx->set_tensor_data("inp_dec", codes.data());
673673

674674
ctx->compute();
675675

examples/tts/mimi-model.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ struct mimi_model {
2222

2323
int get_sample_rate() const;
2424

25+
// layout of codes: N semantic codes followed by (N*31) acoustic codes
2526
std::vector<float> decode(const std::vector<int> & codes);
2627

2728
// TODO: implement encoding pass

examples/tts/mimi.cpp

Lines changed: 40 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
#include <fstream>
66

77

8+
/**
9+
* This file is used for testing and to showcase how to use the "mimi_model" class.
10+
* Please keep it simple and easy to understand.
11+
*/
12+
813
int main(int argc, const char ** argv) {
914
if (argc < 3) {
1015
fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]);
@@ -23,48 +28,45 @@ int main(int argc, const char ** argv) {
2328
std::vector<int> codes;
2429
if (strcmp(codes_path, "dummy0") == 0) {
2530
printf("Using dummy0 codes\n");
26-
codes.resize(32 * 3); // [n_codes = 3, n_codes_per_embd = 32]
27-
int n = 0;
28-
for (int c = 0; c < 32; c++) {
29-
for (int r = 0; r < 3; r++) {
30-
codes[r*32 + c] = n++;
31-
}
31+
codes.resize(32 * 3); // [n_codes_per_embd = 32, n_codes = 3]
32+
for (int i = 0; i < (int)codes.size(); i++) {
33+
codes[i] = i;
3234
}
3335
} else if (strcmp(codes_path, "dummy1") == 0) {
3436
printf("Using dummy1 codes\n");
3537
codes = {
36-
1263 ,1597 ,1596 ,1477 ,1540 ,1720 ,1433 ,118 ,1066 ,1968 ,1096 ,232 ,418 ,566 ,1653 ,2010 ,
37-
1029 ,1874 ,77 ,1803 ,123 ,908 ,97 ,1616 ,595 ,1170 ,1654 ,1211 ,1967 ,1579 ,1846 ,1462 ,
38-
1962 ,175 ,1539 ,742 ,1065 ,1226 ,19 ,955 ,528 ,1031 ,659 ,1687 ,1173 ,1802 ,1031 ,1714 ,
39-
1986 ,582 ,367 ,112 ,1245 ,1386 ,759 ,532 ,1472 ,1790 ,802 ,1213 ,1543 ,1916 ,1251 ,309 ,
40-
1962 ,1280 ,1943 ,878 ,1588 ,1989 ,568 ,1463 ,1814 ,1095 ,103 ,583 ,976 ,998 ,871 ,587 ,
41-
247 ,1698 ,1817 ,1024 ,268 ,597 ,45 ,1608 ,1880 ,2047 ,759 ,1578 ,1612 ,49 ,1031 ,1076 ,
42-
927 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1265 ,1279 ,830 ,1997 ,32 ,1369 ,1686 ,
43-
1307 ,419 ,1143 ,324 ,325 ,572 ,1597 ,1920 ,795 ,915 ,610 ,2000 ,819 ,718 ,1235 ,282 ,
44-
1912 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,732 ,284 ,1407 ,1591 ,1002 ,939 ,671 ,951 ,1411 ,
45-
1887 ,460 ,1588 ,1636 ,1312 ,232 ,969 ,1513 ,1336 ,1185 ,1660 ,4 ,926 ,1243 ,1077 ,1379 ,
46-
704 ,85 ,257 ,1302 ,1029 ,1717 ,899 ,1345 ,355 ,1915 ,1007 ,315 ,1283 ,779 ,415 ,335 ,
47-
1848 ,1786 ,469 ,295 ,380 ,1736 ,393 ,765 ,1921 ,836 ,374 ,1649 ,52 ,1633 ,759 ,548 ,
48-
1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1255 ,1275 ,92 ,976 ,1217 ,483 ,
49-
105 ,1746 ,1158 ,1557 ,1001 ,512 ,1668 ,1255 ,1045 ,1596 ,613 ,1272 ,1366 ,1147 ,411 ,831 ,
50-
349 ,692 ,1435 ,2005 ,1465 ,37 ,892 ,95 ,460 ,557 ,1315 ,259 ,1978 ,1838 ,1232 ,2003 ,
51-
1197 ,111 ,1953 ,1297 ,1843 ,671 ,1687 ,91 ,1788 ,1138 ,1896 ,399 ,615 ,758 ,1423 ,365 ,
52-
288 ,632 ,876 ,875 ,1156 ,345 ,1189 ,638 ,1527 ,1981 ,1925 ,333 ,1353 ,473 ,1913 ,1443 ,
53-
1634 ,1373 ,803 ,420 ,192 ,1440 ,1593 ,1925 ,784 ,831 ,552 ,807 ,1942 ,1289 ,612 ,511 ,
54-
968 ,1091 ,30 ,828 ,1611 ,1241 ,1985 ,596 ,273 ,529 ,1182 ,302 ,726 ,1942 ,733 ,1590 ,
55-
1564 ,214 ,1156 ,1722 ,1215 ,1837 ,1729 ,1823 ,672 ,116 ,340 ,396 ,721 ,462 ,1615 ,1380 ,
56-
1459 ,1553 ,636 ,586 ,1148 ,1147 ,1941 ,471 ,876 ,127 ,1938 ,2002 ,1563 ,1121 ,857 ,1179 ,
57-
1983 ,1324 ,1726 ,1445 ,295 ,270 ,896 ,1947 ,1740 ,1211 ,128 ,1266 ,734 ,715 ,1562 ,285 ,
58-
1139 ,304 ,526 ,653 ,1270 ,320 ,484 ,22 ,687 ,1065 ,489 ,827 ,993 ,1654 ,431 ,1552 ,
59-
1418 ,1604 ,455 ,841 ,412 ,848 ,475 ,540 ,1903 ,575 ,584 ,300 ,1079 ,189 ,1481 ,893 ,
60-
228 ,1577 ,429 ,635 ,106 ,1536 ,176 ,348 ,1733 ,1570 ,537 ,1840 ,798 ,410 ,1714 ,1318 ,
61-
487 ,332 ,1109 ,1744 ,283 ,692 ,681 ,1744 ,1008 ,1715 ,1956 ,1066 ,1768 ,1645 ,139 ,1967 ,
62-
897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,1663 ,1232 ,1681 ,1878 ,1241 ,1805 ,
63-
89 ,1401 ,544 ,1061 ,1166 ,267 ,1351 ,1998 ,1623 ,1898 ,425 ,1320 ,2006 ,865 ,1981 ,823 ,
64-
1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,512 ,1088 ,1375 ,1994 ,1738 ,
65-
725 ,1471 ,811 ,1251 ,1156 ,1664 ,898 ,1511 ,1872 ,1717 ,444 ,1005 ,254 ,103 ,202 ,1769 ,
66-
1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1713 ,741 ,873 ,
67-
447 ,492 ,388 ,321 ,1860 ,1456 ,1658 ,1682 ,848 ,462 ,2034 ,1368 ,1609 ,1887 ,510 ,1516 ,
38+
1049 ,1415 ,1962 ,914 ,1372 ,704 ,1922 ,2036 ,288 ,968 ,193 ,1139 ,897 ,897 ,1243 ,1511 ,
39+
1597 ,175 ,1280 ,1202 ,1911 ,85 ,47 ,692 ,632 ,251 ,1553 ,1735 ,1577 ,132 ,471 ,433 ,
40+
1325 ,1539 ,1943 ,1601 ,141 ,257 ,564 ,1435 ,876 ,1096 ,636 ,61 ,1497 ,1010 ,485 ,284 ,
41+
839 ,776 ,878 ,1719 ,1069 ,1302 ,893 ,2005 ,875 ,908 ,586 ,2001 ,186 ,1932 ,1765 ,721 ,
42+
592 ,1046 ,1588 ,1670 ,1485 ,1141 ,34 ,1465 ,1156 ,1938 ,435 ,753 ,1418 ,277 ,391 ,1741 ,
43+
1440 ,117 ,723 ,412 ,642 ,1717 ,131 ,37 ,345 ,112 ,1979 ,2034 ,1822 ,1536 ,1281 ,56 ,
44+
1341 ,803 ,568 ,568 ,1370 ,1995 ,1063 ,892 ,273 ,895 ,1226 ,354 ,1726 ,1541 ,1607 ,615 ,
45+
985 ,1499 ,1736 ,1838 ,702 ,1345 ,1657 ,511 ,1774 ,1787 ,945 ,1927 ,947 ,952 ,1418 ,916 ,
46+
1239 ,1457 ,1021 ,341 ,284 ,882 ,474 ,1559 ,1923 ,273 ,1330 ,1406 ,1782 ,19 ,116 ,887 ,
47+
1146 ,1307 ,983 ,1237 ,1407 ,1350 ,1960 ,1255 ,878 ,1979 ,1500 ,1939 ,1415 ,88 ,1702 ,1253 ,
48+
1778 ,2 ,10 ,1279 ,999 ,1549 ,1049 ,373 ,1355 ,1200 ,1466 ,1009 ,75 ,2042 ,1725 ,916 ,
49+
1636 ,1135 ,833 ,830 ,1758 ,2015 ,1275 ,1675 ,287 ,744 ,89 ,430 ,1724 ,1232 ,1692 ,535 ,
50+
1485 ,1287 ,973 ,1815 ,314 ,2020 ,424 ,1085 ,982 ,1994 ,1563 ,1269 ,1769 ,1681 ,1082 ,1666 ,
51+
1622 ,1039 ,1209 ,32 ,679 ,732 ,976 ,1462 ,805 ,402 ,1150 ,170 ,1529 ,2013 ,350 ,1175 ,
52+
757 ,1124 ,1091 ,1369 ,1061 ,415 ,1217 ,1135 ,1360 ,1578 ,1205 ,1785 ,1835 ,1241 ,14 ,716 ,
53+
480 ,716 ,681 ,1686 ,1624 ,335 ,865 ,1356 ,1688 ,307 ,366 ,541 ,1262 ,1167 ,59 ,269 ,
54+
1899 ,1798 ,1606 ,1307 ,1549 ,1814 ,114 ,483 ,958 ,1919 ,1179 ,898 ,834 ,1526 ,386 ,447 ,
55+
1481 ,201 ,779 ,419 ,430 ,1451 ,1000 ,156 ,1062 ,615 ,1353 ,414 ,1214 ,1487 ,882 ,32 ,
56+
840 ,1517 ,334 ,1143 ,823 ,454 ,725 ,1298 ,1325 ,649 ,1737 ,913 ,685 ,761 ,2010 ,63 ,
57+
1397 ,1299 ,765 ,1158 ,1809 ,1299 ,1585 ,1776 ,625 ,1539 ,830 ,1563 ,461 ,308 ,1438 ,321 ,
58+
82 ,886 ,1836 ,325 ,1976 ,761 ,359 ,1136 ,1720 ,2036 ,904 ,719 ,526 ,1567 ,145 ,1860 ,
59+
1565 ,1786 ,1400 ,1696 ,232 ,1736 ,512 ,518 ,1895 ,1854 ,1584 ,1393 ,1869 ,1702 ,789 ,1986 ,
60+
116 ,521 ,150 ,1597 ,727 ,1916 ,815 ,1826 ,1382 ,653 ,1596 ,286 ,1373 ,177 ,1397 ,1009 ,
61+
1449 ,353 ,877 ,93 ,266 ,1853 ,1255 ,872 ,1974 ,556 ,1885 ,857 ,992 ,5 ,1921 ,1849 ,
62+
1038 ,1912 ,464 ,795 ,747 ,56 ,124 ,431 ,1868 ,609 ,855 ,1522 ,912 ,1709 ,1507 ,1062 ,
63+
1015 ,1357 ,1487 ,4 ,253 ,1871 ,933 ,215 ,1228 ,633 ,1306 ,2024 ,1453 ,900 ,457 ,471 ,
64+
436 ,1311 ,870 ,1032 ,134 ,984 ,1983 ,1103 ,1627 ,1627 ,414 ,1845 ,583 ,1699 ,1458 ,2018 ,
65+
150 ,450 ,1114 ,369 ,267 ,1273 ,1136 ,1578 ,1063 ,1820 ,120 ,779 ,652 ,1266 ,1929 ,1213 ,
66+
159 ,297 ,1703 ,819 ,93 ,247 ,1366 ,144 ,1617 ,1428 ,812 ,121 ,1637 ,1620 ,289 ,1557 ,
67+
1414 ,971 ,476 ,1685 ,428 ,1802 ,653 ,1290 ,614 ,1663 ,1528 ,1344 ,798 ,1027 ,1305 ,990 ,
68+
1740 ,1154 ,1839 ,912 ,731 ,602 ,1064 ,1508 ,834 ,1387 ,252 ,745 ,1034 ,1102 ,965 ,696 ,
69+
1971 ,1729 ,666 ,282 ,1993 ,1551 ,1703 ,1124 ,1628 ,1725 ,107 ,808 ,1096 ,1753 ,500 ,677 ,
6870
};
6971
} else {
7072
std::ifstream fin(codes_path);

0 commit comments

Comments
 (0)