Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

[tools/mtmd/miniaudio.h]
[tools/mtmd/vendor/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
46 changes: 26 additions & 20 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
# mtmd

# compile mtmd-audio separately to avoid long compile times with miniaudio.h
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
target_include_directories(mtmd_audio PRIVATE .)

add_library(mtmd OBJECT
mtmd.cpp
mtmd-helper.cpp
mtmd-audio.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)

target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})

target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h

target_compile_features(mtmd PRIVATE cxx_std_17)

add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h

add_library(mtmd_helper OBJECT
mtmd-helper.cpp
mtmd-helper.h
)

target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd_helper PUBLIC .)
target_include_directories(mtmd_helper PRIVATE ./vendor)
target_include_directories(mtmd_helper PRIVATE ../..)
target_compile_features(mtmd_helper PRIVATE cxx_std_17)

if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)

set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_helper_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
# for stb_image.h and miniaudio.h
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
endif()

if(TARGET BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
add_dependencies(mtmd_helper BUILD_INFO)
endif()

add_executable(llama-llava-cli deprecation-warning.cpp)
Expand All @@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
27 changes: 0 additions & 27 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
#include "ggml-backend.h"
#include "gguf.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
Expand Down Expand Up @@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
}

bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}

bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}

// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
dst.nx = src.nx;
Expand Down
86 changes: 0 additions & 86 deletions tools/mtmd/mtmd-audio.cpp
Original file line number Diff line number Diff line change
@@ -1,28 +1,5 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif

#include "mtmd-audio.h"

//#define MTMD_AUDIO_DEBUG

#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"

#define _USE_MATH_DEFINES // for M_PI
#include <cmath>
#include <cstdint>
Expand Down Expand Up @@ -359,69 +336,6 @@ bool preprocess_audio(
} // namespace whisper_preprocessor


namespace audio_helpers {

bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}

// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;

return is_wav || is_mp3 || is_flac;
}

// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;

result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}

ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}

pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}

#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif

ma_decoder_uninit(&decoder);
return true;
}

} // namespace wav_utils


// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
//
Expand Down
19 changes: 2 additions & 17 deletions tools/mtmd/mtmd-audio.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,16 @@ struct whisper_filters {
std::vector<float> data;
};

extern bool preprocess_audio(
bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
std::vector<whisper_mel> & output);

} // namespace whisper_preprocessor


// TODO @ngxson : move this helper to mtmd-helpers.cpp
namespace audio_helpers {

extern bool is_audio_file(const char * buf, size_t len);

extern bool decode_audio_from_buf(
const unsigned char * buf_in,
size_t len,
int target_sampler_rate,
std::vector<float> & pcmf32_mono);

} // namespace audio_helpers


namespace whisper_precalc_filters {

extern whisper_preprocessor::whisper_filters get_128_bins();
whisper_preprocessor::whisper_filters get_128_bins();

} // namespace whisper_precalc_filters
3 changes: 2 additions & 1 deletion tools/mtmd/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include "mtmd-helper.h"

#include <vector>
#include <limits.h>
Expand Down Expand Up @@ -143,7 +144,7 @@ struct mtmd_cli_context {
}

bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
if (!bmp.ptr) {
return false;
}
Expand Down
Loading
Loading