Skip to content

Commit 620568a

Browse files
committed
mtmd : move helpers to dedicated library
1 parent aa6dff0 commit 620568a

File tree

14 files changed

+276
-259
lines changed

14 files changed

+276
-259
lines changed

.editorconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,6 @@ charset = unset
4949
trim_trailing_whitespace = unset
5050
insert_final_newline = unset
5151

52-
[tools/mtmd/miniaudio.h]
52+
[tools/mtmd/vendor/miniaudio.h]
5353
trim_trailing_whitespace = unset
5454
insert_final_newline = unset

tools/mtmd/CMakeLists.txt

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,37 +12,53 @@ target_include_directories(mtmd_audio PRIVATE .)
1212

1313
add_library(mtmd OBJECT
1414
mtmd.cpp
15-
mtmd-helper.cpp
15+
mtmd-audio.cpp
1616
mtmd.h
1717
clip.cpp
1818
clip.h
1919
clip-impl.h
2020
)
2121

22-
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
23-
22+
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
2423
target_include_directories(mtmd PUBLIC .)
2524
target_include_directories(mtmd PRIVATE ../..)
26-
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
27-
2825
target_compile_features(mtmd PRIVATE cxx_std_17)
2926

30-
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
27+
# Helper library for mtmd, to avoid long compile times with miniaudio.h and stb_image.h
28+
29+
add_library(mtmd_helper OBJECT
30+
mtmd-helper.cpp
31+
mtmd-helper.h
32+
)
33+
34+
target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
35+
target_include_directories(mtmd_helper PUBLIC .)
36+
target_include_directories(mtmd_helper PRIVATE ./vendor)
37+
target_include_directories(mtmd_helper PRIVATE ../..)
38+
target_compile_features(mtmd_helper PRIVATE cxx_std_17)
39+
3140
if (BUILD_SHARED_LIBS)
3241
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
3342
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
3443
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
35-
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
44+
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
3645
install(TARGETS mtmd_shared LIBRARY)
46+
47+
set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
48+
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
49+
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd_helper>) # package the helper objects (mtmd-helper.cpp), not the core mtmd objects
50+
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
51+
install(TARGETS mtmd_helper_shared LIBRARY)
3752
endif()
3853

3954
if (NOT MSVC)
40-
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
41-
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
55+
# for stb_image.h and miniaudio.h
56+
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
4257
endif()
4358

4459
if(TARGET BUILD_INFO)
4560
add_dependencies(mtmd BUILD_INFO)
61+
add_dependencies(mtmd_helper BUILD_INFO)
4662
endif()
4763

4864
add_executable(llama-llava-cli deprecation-warning.cpp)
@@ -54,5 +70,5 @@ set(TARGET llama-mtmd-cli)
5470
add_executable(${TARGET} mtmd-cli.cpp)
5571
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
5672
install(TARGETS ${TARGET} RUNTIME)
57-
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
73+
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
5874
target_compile_features(${TARGET} PRIVATE cxx_std_17)

tools/mtmd/clip.cpp

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,6 @@
1111
#include "ggml-backend.h"
1212
#include "gguf.h"
1313

14-
#define STB_IMAGE_IMPLEMENTATION
15-
#include "stb_image.h"
16-
1714
#include <cassert>
1815
#include <cmath>
1916
#include <cstdlib>
@@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
27862783
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
27872784
}
27882785

2789-
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
2790-
int nx, ny, nc;
2791-
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
2792-
if (!data) {
2793-
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
2794-
return false;
2795-
}
2796-
clip_build_img_from_pixels(data, nx, ny, img);
2797-
stbi_image_free(data);
2798-
return true;
2799-
}
2800-
2801-
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
2802-
int nx, ny, nc;
2803-
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
2804-
if (!data) {
2805-
LOG_ERR("%s: failed to decode image bytes\n", __func__);
2806-
return false;
2807-
}
2808-
clip_build_img_from_pixels(data, nx, ny, img);
2809-
stbi_image_free(data);
2810-
return true;
2811-
}
2812-
28132786
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
28142787
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
28152788
dst.nx = src.nx;

tools/mtmd/mtmd-audio.cpp

Lines changed: 0 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,5 @@
1-
// fix problem with std::min and std::max
2-
#if defined(_WIN32)
3-
#define WIN32_LEAN_AND_MEAN
4-
#ifndef NOMINMAX
5-
# define NOMINMAX
6-
#endif
7-
#include <windows.h>
8-
#endif
9-
101
#include "mtmd-audio.h"
112

12-
//#define MTMD_AUDIO_DEBUG
13-
14-
#define MINIAUDIO_IMPLEMENTATION
15-
#ifndef MTMD_AUDIO_DEBUG
16-
# define MA_NO_ENCODING
17-
#endif
18-
#define MA_NO_DEVICE_IO
19-
#define MA_NO_RESOURCE_MANAGER
20-
#define MA_NO_NODE_GRAPH
21-
#define MA_NO_ENGINE
22-
#define MA_NO_GENERATION
23-
#define MA_API static
24-
#include "miniaudio.h"
25-
263
#define _USE_MATH_DEFINES // for M_PI
274
#include <cmath>
285
#include <cstdint>
@@ -359,69 +336,6 @@ bool preprocess_audio(
359336
} // namespace whisper_preprocessor
360337

361338

362-
namespace audio_helpers {
363-
364-
bool is_audio_file(const char * buf, size_t len) {
365-
if (len < 12) {
366-
return false;
367-
}
368-
369-
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
370-
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
371-
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
372-
bool is_mp3 = len >= 3 && (
373-
memcmp(buf, "ID3", 3) == 0 ||
374-
// Check for MPEG sync word (simplified check)
375-
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
376-
);
377-
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
378-
379-
return is_wav || is_mp3 || is_flac;
380-
}
381-
382-
// returns true if the buffer is a valid audio file
383-
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
384-
ma_result result;
385-
const int channels = 1;
386-
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
387-
ma_decoder decoder;
388-
389-
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
390-
if (result != MA_SUCCESS) {
391-
return false;
392-
}
393-
394-
ma_uint64 frame_count;
395-
ma_uint64 frames_read;
396-
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
397-
if (result != MA_SUCCESS) {
398-
ma_decoder_uninit(&decoder);
399-
return false;
400-
}
401-
402-
pcmf32_mono.resize(frame_count);
403-
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
404-
if (result != MA_SUCCESS) {
405-
ma_decoder_uninit(&decoder);
406-
return false;
407-
}
408-
409-
#ifdef MTMD_AUDIO_DEBUG
410-
// save audio to wav file
411-
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
412-
ma_encoder encoder;
413-
ma_encoder_init_file("output.wav", &config, &encoder);
414-
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
415-
ma_encoder_uninit(&encoder);
416-
#endif
417-
418-
ma_decoder_uninit(&decoder);
419-
return true;
420-
}
421-
422-
} // namespace audio_helpers
423-
424-
425339
// precalculated mel filter banks
426340
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
427341
//

tools/mtmd/mtmd-audio.h

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,31 +32,16 @@ struct whisper_filters {
3232
std::vector<float> data;
3333
};
3434

35-
extern bool preprocess_audio(
35+
bool preprocess_audio(
3636
const float * samples,
3737
size_t n_samples,
3838
const whisper_filters & filters,
3939
std::vector<whisper_mel> & output);
4040

4141
} // namespace whisper_preprocessor
4242

43-
44-
// TODO @ngxson : move this helper to mtmd-helpers.cpp
45-
namespace audio_helpers {
46-
47-
extern bool is_audio_file(const char * buf, size_t len);
48-
49-
extern bool decode_audio_from_buf(
50-
const unsigned char * buf_in,
51-
size_t len,
52-
int target_sampler_rate,
53-
std::vector<float> & pcmf32_mono);
54-
55-
} // namespace audio_helpers
56-
57-
5843
namespace whisper_precalc_filters {
5944

60-
extern whisper_preprocessor::whisper_filters get_128_bins();
45+
whisper_preprocessor::whisper_filters get_128_bins();
6146

6247
} // namespace whisper_precalc_filters

tools/mtmd/mtmd-cli.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "console.h"
88
#include "chat.h"
99
#include "mtmd.h"
10+
#include "mtmd-helper.h"
1011

1112
#include <vector>
1213
#include <limits.h>
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
143144
}
144145

145146
bool load_media(const std::string & fname) {
146-
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
147+
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
147148
if (!bmp.ptr) {
148149
return false;
149150
}

0 commit comments

Comments
 (0)