8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -21,6 +21,11 @@ endif ()

if (ARM)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# Check whether the compiler is GCC or Clang
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod")
endif()
else ()
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
endif ()
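The `-march=armv8.2-a+dotprod` flag enables the Armv8.2-A SDOT/UDOT instructions that the new AArch64 GEMM path relies on. Note that `add_compile_definitions(__ARM_FEATURE_DOTPROD)` force-defines a macro the compiler would normally set itself once `+dotprod` is active, presumably so the guarded kernels build even if feature detection falls through. A minimal sketch of the kind of int8 kernel this gates (illustrative only, not code from this PR; `dot_i8` is a hypothetical name):

```cpp
#include <arm_neon.h>
#include <cstdint>

#if defined(__ARM_FEATURE_DOTPROD)
// vdotq_s32 folds four 4-way int8 dot products into the int32 lanes of acc,
// replacing the widen/multiply/pairwise-add sequence needed on plain NEON.
static int32_t dot_i8(const int8_t *a, const int8_t *b, int n) {
    int32x4_t acc = vdupq_n_s32(0);
    for (int i = 0; i + 16 <= n; i += 16) { // tail handling omitted for brevity
        acc = vdotq_s32(acc, vld1q_s8(a + i), vld1q_s8(b + i));
    }
    return vaddvq_s32(acc); // horizontal sum of the four accumulators
}
#endif
```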
@@ -96,7 +101,8 @@ endif ()
if (QUANT)
include_directories(${PROJECT_SOURCE_DIR}/src/quantizer)
file(GLOB_RECURSE MLLM_QUANT

${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.hpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.cpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.hpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.cpp
)
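Adding GEMM_AArch64.{hpp,cpp} to the MLLM_QUANT sources suggests the quantizer tool itself performs the repacking into the interleaved block layouts introduced in Types.hpp below; that reading is inferred from the build change rather than stated in the PR.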
66 changes: 35 additions & 31 deletions examples/demo_imagebind_1mod.cpp
@@ -13,53 +13,57 @@ int main(int argc, char **argv) {
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
string merges_path = cmdParser.get<string>("merges");
int loop_times = cmdParser.get<int>("loop_times");
string modality = cmdParser.get<string>("modality");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto processor = ImagebindProcessor(vocab_path, merges_path);

ImagebindConfig config("huge");

int loop_times = 10;

// auto input_tensors = processor.process(
// {"a dog.", "A car", "A bird"},config.max_position_embeddings,
// {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
// {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});

auto input_tensors = processor.process(
{"a dog."},config.max_position_embeddings,
{"a dog."}, config.max_position_embeddings,
{"../assets/dog_image.jpg"}, config.img_hw,
{"../assets/dog_audio.wav"});

std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
auto text_model = ImagebindTextModel(config);
text_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);

if (modality == "text" || modality == "all") {
std::cout << "Text| input_shape:[" << input_tensors.text_tensors.batch() << ", " << input_tensors.text_tensors.sequence() << ", " << input_tensors.text_tensors.head() << ", " << input_tensors.text_tensors.dimension() << "]" << std::endl;
auto text_model = ImagebindTextModel(config);
text_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
}
text_model.profiling();
text_model.free();
}
text_model.profiling();
text_model.free();

std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
auto vision_model = ImagebindVisionModel(config);
vision_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = vision_model({input_tensors.img_tensors});
if (modality == "vision" || modality == "all") {
std::cout << "Vision| input_shape:[" << input_tensors.img_tensors.batch() << ", " << input_tensors.img_tensors.channel() << ", " << input_tensors.img_tensors.time() << ", " << input_tensors.img_tensors.height() << ", " << input_tensors.img_tensors.width() << "]" << std::endl;
auto vision_model = ImagebindVisionModel(config);
vision_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = vision_model({input_tensors.img_tensors});
}
vision_model.profiling();
vision_model.free();
}
vision_model.profiling();
vision_model.free();

std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
auto audio_model = ImagebindAudioModel(config);
audio_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = audio_model({input_tensors.audio_tensors});
if (modality == "audio" || modality == "all") {
std::cout << "Audio| input_shape:[" << input_tensors.audio_tensors.batch() << ", " << input_tensors.audio_tensors.sequence() << ", " << input_tensors.audio_tensors.head() << ", " << input_tensors.audio_tensors.dimension() << "]" << std::endl;
auto audio_model = ImagebindAudioModel(config);
audio_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = audio_model({input_tensors.audio_tensors});
}
audio_model.profiling();
audio_model.free();
}
audio_model.profiling();
audio_model.free();

return 0;
}
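With the two new flags, one tower can be benchmarked in isolation, e.g. `./demo_imagebind_1mod -o vision -l 5` to time only the vision model for five loops (binary name and default paths assumed from the parser setup above).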
1 change: 1 addition & 0 deletions examples/demo_llama.cpp
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
model.profiling();
}

return 0;
4 changes: 2 additions & 2 deletions examples/demo_qwen.cpp
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-0.5b-q4_k.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.load(model_path);
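The default demo model moves from Qwen-1.5 0.5B in q4_k to 1.8B in q8_0, plausibly to exercise the new Q8 repacked GEMM path on AArch64, though the PR itself does not state the motivation.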

12 changes: 6 additions & 6 deletions examples/demo_yi.cpp
@@ -9,9 +9,9 @@
*
*/
#include "cmdline.h"
#include "models/yi/configuration_yi.hpp"
#include "models/yi/modeling_yi.hpp"
#include "models/yi/tokenization_yi.hpp"
#include "models/llama/configuration_llama.hpp"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;
@@ -29,9 +29,9 @@ int main(int argc, char **argv) {
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = YiTokenizer(vocab_path);
YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
auto model = YiForCausalLM(config);
auto tokenizer = LLaMATokenizer(vocab_path, false);
LLaMAConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE, 64000);
auto model = LLaMAModel(config);
model.load(model_path);

vector<string> in_strs = {
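Yi shares the LLaMA architecture and differs mainly in its tokenizer and 64000-entry vocabulary, which is evidently why the demo now reuses the LLaMA tokenizer, config, and model classes and passes the vocabulary size to LLaMAConfig explicitly.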
4 changes: 2 additions & 2 deletions examples/main_alpaca.cpp
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
v = _KVCache( {v}, cache_max, name + ".v_cache");
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
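This is the PR's recurring graph change: the standalone `_Causalmask` op is folded into `_Softmax`, whose new third argument toggles causal masking (`true` for autoregressive text, `false` elsewhere). A sketch of what a fused causal softmax computes over one attention row, assuming query position `i` may only attend to keys `j <= i` (illustrative, not the mllm kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Softmax over one row of attention scores with the causal mask applied
// in-place: positions j > i behave like exp(-inf) == 0 and are never summed.
void causal_softmax_row(std::vector<float> &row, int i) {
    float max_v = -std::numeric_limits<float>::infinity();
    for (int j = 0; j <= i; ++j) max_v = std::max(max_v, row[j]);
    float sum = 0.0f;
    for (int j = 0; j < (int)row.size(); ++j) {
        if (j > i) { row[j] = 0.0f; continue; } // masked position
        row[j] = std::exp(row[j] - max_v);
        sum += row[j];
    }
    for (int j = 0; j <= i; ++j) row[j] /= sum;
}
```

Fusing the mask into the softmax kernel avoids materializing and re-reading a full seq x seq mask tensor, which is most noticeable during prefill.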
6 changes: 4 additions & 2 deletions examples/main_clip.cpp
@@ -45,9 +45,11 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
if(name.find("text_model") != std::string::npos){
qk = _Causalmask( {qk}, name + ".mask");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
4 changes: 2 additions & 2 deletions examples/main_fuyu.cpp
@@ -102,8 +102,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = _Scale({qk}, 1.0F / std::sqrt(head_size), 0.0F, false, name + ".scale");
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".dense");
9 changes: 5 additions & 4 deletions examples/main_imagebind.cpp
@@ -118,9 +118,10 @@ NetTensor *Attention(Context *c,NetTensor *x, int embedding_size, int hidden_siz
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
if(name.find("text") != std::string::npos){
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
@@ -227,10 +228,10 @@ void ImageBind(Context* c) {
a = a->transpose(BATCH, SEQUENCE);

auto *j1 = _Matmul( {p, i}, false, true, "final.vision@text");
j1 = _Softmax( {j1}, DIMENSION, "final.vision@text.softmax");
j1 = _Softmax( {j1}, DIMENSION, false, "final.vision@text.softmax");

auto *j2 = _Matmul( {p, a}, false, true, "final.vision@audio");
j2 = _Softmax( {j2}, DIMENSION, "final.vision@audio.softmax");
j2 = _Softmax( {j2}, DIMENSION, false, "final.vision@audio.softmax");

i = _Cat( {j1, j2}, BATCH, "final.cat");
}
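The tail of ImageBind's graph computes retrieval scores: vision embeddings are matmul'd against the text and audio embeddings and softmax-normalized over the candidate axis (causal flag now explicitly `false`), giving each image a distribution over text and audio matches.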
4 changes: 2 additions & 2 deletions examples/main_llama.cpp
@@ -50,8 +50,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".wo");
9 changes: 5 additions & 4 deletions examples/main_llava.cpp
@@ -72,8 +72,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
@@ -117,9 +117,10 @@ NetTensor *VisionAttention(NetTensor *x, int embedding_size, int hidden_size, in
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = _Scale({qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
if (name.find("text_model") != std::string::npos) {
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
4 changes: 2 additions & 2 deletions examples/main_tinyllama.cpp
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
v = _KVCache( {v},head_size/mutil_key_value_head, cache_max, name + ".v_cache");
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
2 changes: 1 addition & 1 deletion examples/main_vit.cpp
@@ -1089,7 +1089,7 @@ NetTensor *Attention(NetTensor * x, int embedded_size, int hidden_size, int head
qk = *qk/std::sqrt(hidden_size);
// qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedded_size, true, name + ".output.dense");
56 changes: 56 additions & 0 deletions include/Types.hpp
@@ -56,6 +56,10 @@ enum DataType {
MLLM_TYPE_I8,
MLLM_TYPE_I16,
MLLM_TYPE_I32,
MLLM_TYPE_Q4_0_4_4=19,
MLLM_TYPE_Q4_0_4_8=20,
MLLM_TYPE_Q4_0_8_8=21,
MLLM_TYPE_Q8_0_4_4,
MLLM_TYPE_COUNT,
};
enum ChlType {
@@ -147,6 +151,8 @@ enum RoPEType {
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// #define LLAMAFILE_SGEMM

#if defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 mllm_fp16_t;
#else
@@ -223,6 +229,39 @@ typedef struct {
#pragma pack()
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K / 16 * sizeof(int16_t), "wrong q8_K block size/padding");


#pragma pack(1)
typedef struct {
mllm_fp16_t d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
#pragma pack()
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(mllm_fp16_t) + QK4_0 * 2, "wrong q4_0x4 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
#pragma pack()
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(mllm_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
#pragma pack()
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(mllm_fp16_t) + QK8_0 * 4, "wrong q8_0x4 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
#pragma pack()
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(mllm_fp16_t) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
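The x4/x8 structs mirror llama.cpp's repacked q4_0/q8_0 layouts, where names like `Q4_0_4_4` appear to encode rows times interleave width for NEON, i8mm, and SVE kernels respectively; worth confirming against GEMM_AArch64.cpp. A hypothetical sketch of the simplest possible repack, assuming mllm's usual `block_q4_0` (one fp16 delta plus QK4_0/2 nibble bytes); the real kernels presumably interleave the quants lane-wise rather than concatenating:

```cpp
// Hypothetical helper, not from this PR: gather four consecutive q4_0 blocks
// into one block_q4_0x4 by plain concatenation of deltas and nibbles.
static block_q4_0x4 pack_q4_0x4_naive(const block_q4_0 (&src)[4]) {
    block_q4_0x4 out;
    for (int b = 0; b < 4; ++b) {
        out.d[b] = src[b].d;                    // per-block fp16 scale
        for (int j = 0; j < QK4_0 / 2; ++j)     // 16 nibble bytes per block
            out.qs[b * (QK4_0 / 2) + j] = src[b].qs[j];
    }
    return out;
}
```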

//

static string DataTypeName(DataType dataType) {
@@ -251,6 +290,14 @@ static string DataTypeName(DataType dataType) {
return "Q4_1";
case MLLM_TYPE_Q8_1:
return "Q8_1";
case MLLM_TYPE_Q4_0_4_4:
return "Q4_0_4_4";
case MLLM_TYPE_Q4_0_4_8:
return "Q4_0_4_8";
case MLLM_TYPE_Q4_0_8_8:
return "Q4_0_8_8";
case MLLM_TYPE_Q8_0_4_4:
return "Q8_0_4_4";
case MLLM_TYPE_COUNT:
return "COUNT";
default:
@@ -281,6 +328,15 @@ static size_t DataTypeSize(DataType dtype, int count = 1) {
return (sizeof(block_q8_K)) * count / (QK_K);
case MLLM_TYPE_Q4_1:
case MLLM_TYPE_Q8_1:
return -1;
case MLLM_TYPE_Q4_0_4_4:
return (sizeof(block_q4_0x4)) * count / (QK4_0 * 4);
case MLLM_TYPE_Q4_0_4_8:
return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
case MLLM_TYPE_Q4_0_8_8:
return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
case MLLM_TYPE_Q8_0_4_4:
return (sizeof(block_q8_0x4)) * count / (QK8_0 * 4);
case MLLM_TYPE_COUNT:
return 0;
default:
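Size sanity check for the new branches: sizeof(block_q4_0x4) = 4*2 + 64 = 72 bytes per 128 weights and sizeof(block_q4_0x8) = 8*2 + 128 = 144 bytes per 256 weights, both exactly q4_0's 4.5 bits per weight; block_q8_0x4 is 4*2 + 128 = 136 bytes per 128 weights, matching q8_0's 8.5 bits. The repacked formats therefore change memory layout, not model size.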