Skip to content

Commit 021dfa8

Browse files
author
lexasub
committed
tool: add conversion of text/parquet to custom format
1 parent daf2dd7 commit 021dfa8

File tree

60 files changed

+12073
-9
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+12073
-9
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
1212
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
1313
endif()
1414

15+
# Report the selected build type as a regular status line; a mode-less
# message() is emitted as a NOTICE on stderr, which tooling may flag.
message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
16+
1517
# Add path to modules
1618
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
1719

@@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
8486
# 3rd party libs
8587
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
8688
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
89+
option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF)
90+
91+
92+
# Expose the LLAMA_PARQUET preprocessor macro to the targets created in this
# directory and below when Parquet support is requested.
if(LLAMA_PARQUET)
    add_compile_definitions(LLAMA_PARQUET)
endif()
8795

8896
# Required for relocatable CMake package
8997
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

common/arg.cpp

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,7 +1489,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14891489
[](common_params & params, int value) {
14901490
params.n_chunks = value;
14911491
}
1492-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
1492+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
14931493
add_opt(common_arg(
14941494
{"-fa", "--flash-attn"},
14951495
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -1551,7 +1551,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15511551
}
15521552
params.in_files.push_back(value);
15531553
}
1554-
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1554+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
15551555
add_opt(common_arg(
15561556
{"-bf", "--binary-file"}, "FNAME",
15571557
"binary file containing the prompt (default: none)",
@@ -2637,9 +2637,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26372637
{"-o", "--output", "--output-file"}, "FNAME",
26382638
string_format("output file (default: '%s')", params.out_file.c_str()),
26392639
[](common_params & params, const std::string & value) {
2640-
params.out_file = value;
2640+
params.out_file = value;
26412641
}
2642-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
2642+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
26432643
add_opt(common_arg(
26442644
{"-ofreq", "--output-frequency"}, "N",
26452645
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3511,5 +3511,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35113511
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
35123512

35133513

3514+
add_opt(common_arg(
3515+
{"--dataset-format"}, " ",
3516+
string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()),
3517+
[](common_params & params, const std::string & format) {
3518+
params.dataset_format = format; //TODO ENUM CLASS
3519+
}
3520+
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3521+
3522+
add_opt(common_arg(
3523+
{"--max-seq-len"}, " ",
3524+
string_format("max sequence length (default: %d)", params.max_seq_len),
3525+
[](common_params & params, int32_t max_seq_len) {
3526+
params.max_seq_len = max_seq_len;
3527+
}
3528+
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3529+
3530+
add_opt(common_arg(
3531+
{"--pre-tokenized"},
3532+
string_format("input file contains pre-tokenized data (space-separated token IDs)"),
3533+
[](common_params & params) {
3534+
params.pre_tokenized = true;
3535+
}
3536+
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3537+
3538+
add_opt(common_arg(
3539+
{"--preview"},
3540+
string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
3541+
[](common_params & params) {
3542+
params.do_preview = true;
3543+
}
3544+
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3545+
3546+
add_opt(common_arg(
3547+
{"--dataset-column"}, "<name>",
3548+
string_format("column name for data in dataset files"),
3549+
[](common_params & params, const std::string &dataset_column) {
3550+
params.dataset_column = dataset_column;
3551+
}
3552+
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3553+
35143554
return ctx_arg;
35153555
}

common/common.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
#include "llama-cpp.h"
66

7+
#include <cmath>
8+
#include <map>
79
#include <set>
10+
#include <sstream>
811
#include <string>
912
#include <string_view>
1013
#include <vector>
11-
#include <map>
12-
#include <sstream>
1314

1415
#ifdef _WIN32
1516
#define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +83,7 @@ enum llama_example {
8283
LLAMA_EXAMPLE_PARALLEL,
8384
LLAMA_EXAMPLE_TTS,
8485
LLAMA_EXAMPLE_DIFFUSION,
86+
LLAMA_EXAMPLE_FINETUNE,
8587

8688
LLAMA_EXAMPLE_COUNT,
8789
};
@@ -300,6 +302,7 @@ struct common_params {
300302
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
301303
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
302304
std::string logits_file = ""; // file for saving *all* logits // NOLINT
305+
std::string dataset_format = "text"; // "text" | "parquet"
303306

304307
std::vector<std::string> in_files; // all input files
305308
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -464,6 +467,10 @@ struct common_params {
464467
// return false from callback to abort model loading or true to continue
465468
llama_progress_callback load_progress_callback = NULL;
466469
void * load_progress_callback_user_data = NULL;
470+
int32_t max_seq_len = 2048;
471+
bool do_preview = false;
472+
bool pre_tokenized = false;
473+
std::string dataset_column = "data";
467474
};
468475

469476
// call once at the start of a program if it uses libcommon

ggml/include/gguf.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,14 @@ extern "C" {
197197
// writes the meta data to pointer "data"
198198
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
199199

200+
// stores "data" as the context's data pointer (plain pointer assignment, nothing is copied or freed)
GGML_API void gguf_context_set_data(struct gguf_context * ctx, void * data);
201+
// returns the data pointer currently stored in the context
GGML_API void * gguf_context_get_data(struct gguf_context * ctx);
202+
// creates a context from key/value metadata that is already in memory instead of reading a file
// NOTE: the implementation casts "metadata" to a C++ std::unordered_map<std::string, std::string>,
//       so it cannot be called from plain C; every map entry is added to the context as a KV pair
// "version" and "alignment" are stored in the context; offset and size are 0 and the data pointer is NULL
// returns NULL if "alignment" is zero or not a power of 2
GGML_API struct gguf_context * gguf_init_from_in_memory_data(
203+
const void* metadata,
204+
uint32_t version,
205+
size_t alignment
206+
);
207+
200208
#ifdef __cplusplus
201209
}
202210
#endif

ggml/src/gguf.cpp

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
#include "ggml.h"
2-
#include "ggml-backend.h"
3-
#include "ggml-impl.h"
41
#include "gguf.h"
52

63
#include <cinttypes>
@@ -13,8 +10,13 @@
1310
#include <new>
1411
#include <stdexcept>
1512
#include <string>
13+
#include <unordered_map>
1614
#include <vector>
1715

16+
#include "ggml-backend.h"
17+
#include "ggml-impl.h"
18+
#include "ggml.h"
19+
1820
template <typename T>
1921
struct type_to_gguf_type;
2022

@@ -316,6 +318,14 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
316318
return true;
317319
}
318320

321+
void * gguf_context_get_data(gguf_context * ctx) {
322+
return ctx->data;
323+
}
324+
325+
void gguf_context_set_data(gguf_context * ctx, void * data) {
326+
ctx->data = data;
327+
}
328+
319329
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
320330
const struct gguf_reader gr(file);
321331
struct gguf_context * ctx = new gguf_context;
@@ -926,6 +936,36 @@ int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
926936
return tensor_id;
927937
}
928938

939+
struct gguf_context * gguf_init_from_in_memory_data(const void * _metadata, uint32_t version, size_t alignment) {
940+
941+
const auto* metadata_map = reinterpret_cast<const std::unordered_map<std::string, std::string>*>(_metadata);
942+
943+
struct gguf_context * ctx = new gguf_context{};
944+
ctx->version = version;
945+
ctx->alignment = alignment;
946+
ctx->offset = 0;
947+
ctx->size = 0;
948+
ctx->data = nullptr;
949+
if (ctx == nullptr) {
950+
GGML_LOG_ERROR("%s: failed to allocate gguf_context\n", __func__);
951+
return nullptr;
952+
}
953+
954+
ctx->kv.reserve(metadata_map->size());
955+
for (const auto& pair : *metadata_map) {
956+
gguf_kv kv{pair.first, pair.second};
957+
ctx->kv.emplace_back(kv);
958+
}
959+
960+
if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
961+
GGML_LOG_ERROR("%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
962+
gguf_free(ctx);
963+
return nullptr;
964+
}
965+
966+
return ctx;
967+
}
968+
929969
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
930970
GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
931971
return ctx->info[tensor_id].offset;

tools/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,5 @@ else()
3636
add_subdirectory(cvector-generator)
3737
add_subdirectory(export-lora)
3838
endif()
39+
add_subdirectory(dataset-converter)
3940
endif()
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
cmake_minimum_required(VERSION 3.15)

# Build script for the dataset-converter tool.
#
# Targets defined here:
#   dataset_convert_lib - static library with the converter sources
#   dataset_converter   - command-line executable (tools/convert-to-gguf.cpp)
#   one test executable per .cpp file found under tests/
#
# Options read here:
#   LLAMA_PARQUET                        - link Arrow/Parquet and define
#                                          LLAMA_DATASET_PARQUET_SUPPORT
#   PROJECT_BUILD_DATASET_CONVERTER      - build this tool at all (default ON)
#   LLAMA_DATASET_CONVERTER_BUILD_TESTS  - build and register the tests (default ON)

# Option to control the build of the dataset-converter tool.
option(PROJECT_BUILD_DATASET_CONVERTER "Build the dataset-converter tool" ON)

# Bail out before looking for any dependency, so that disabling the tool also
# removes the hard requirement on Arrow/Parquet.
if(NOT PROJECT_BUILD_DATASET_CONVERTER)
    message(STATUS "Skipping build of dataset-converter tool.")
    return()
endif()

# Option to control the tests of the tool; ON keeps them built by default.
option(LLAMA_DATASET_CONVERTER_BUILD_TESTS "Build and register the dataset-converter tests" ON)

# Provides the Threads::Threads imported target linked below.
find_package(Threads REQUIRED)

# Check for Arrow/Parquet dependencies.
if(LLAMA_PARQUET)
    find_package(Arrow REQUIRED)
    find_package(Parquet REQUIRED)
endif()

# Gather all .cpp source files for the library. CONFIGURE_DEPENDS makes the
# build re-check the glob so added or removed files are picked up.
# NOTE(review): safety/ is on the include path but is not globbed for sources -
# presumably header-only; confirm.
file(GLOB_RECURSE DATASET_CONVERTER_SOURCES CONFIGURE_DEPENDS
    "${CMAKE_CURRENT_SOURCE_DIR}/core/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/gguf/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/parquet/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/text/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/streaming/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/validation/*.cpp"
)

add_library(dataset_convert_lib STATIC ${DATASET_CONVERTER_SOURCES})

# PUBLIC so that the executable and the tests, which link this library,
# receive the same include path they previously got from the directory scope.
target_include_directories(dataset_convert_lib PUBLIC
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${CMAKE_CURRENT_SOURCE_DIR}/../../common"
    "${CMAKE_CURRENT_SOURCE_DIR}/../.."
    "${CMAKE_CURRENT_SOURCE_DIR}/../../src"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/gguf"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/parquet"
    "${CMAKE_CURRENT_SOURCE_DIR}/formats/text"
    "${CMAKE_CURRENT_SOURCE_DIR}/core"
    "${CMAKE_CURRENT_SOURCE_DIR}/streaming"
    "${CMAKE_CURRENT_SOURCE_DIR}/safety"
    "${CMAKE_CURRENT_SOURCE_DIR}/validation"
)

# Require C++17 for the library and for everything that links it.
target_compile_features(dataset_convert_lib PUBLIC cxx_std_17)

# Link dependencies to the target.
target_link_libraries(dataset_convert_lib PRIVATE
    llama
    common
    Threads::Threads
    ggml
)

# Enable and link Arrow/Parquet if requested. The definition is PUBLIC so the
# executable and the tests are compiled with it as well.
if(LLAMA_PARQUET)
    target_compile_definitions(dataset_convert_lib PUBLIC LLAMA_DATASET_PARQUET_SUPPORT)
    target_link_libraries(dataset_convert_lib PRIVATE
        Arrow::arrow_shared
        Parquet::parquet_shared
    )
endif()

# Define the executable target.
add_executable(dataset_converter "${CMAKE_CURRENT_SOURCE_DIR}/tools/convert-to-gguf.cpp")
target_link_libraries(dataset_converter PRIVATE dataset_convert_lib)

# Installation rule for the executable. No absolute destination is given, so
# the binary goes to the default runtime directory relative to the install
# prefix and the install tree stays relocatable.
install(TARGETS dataset_converter RUNTIME)

# add_test_target(<test-source>)
#
# Creates one test executable from the absolute path <test-source>, links it
# against dataset_convert_lib and registers it as a test. The target/test name
# is the path relative to this directory with '/', '\', '.' and ' ' replaced
# by '_', e.g. tests/unit/foo.cpp -> tests_unit_foo_cpp.
function(add_test_target TEST_SRC)
    file(RELATIVE_PATH REL_TEST_NAME "${CMAKE_CURRENT_SOURCE_DIR}" "${TEST_SRC}")
    string(REPLACE "/" "_" REL_TEST_NAME "${REL_TEST_NAME}")
    string(REPLACE "\\" "_" REL_TEST_NAME "${REL_TEST_NAME}")
    string(REPLACE "." "_" REL_TEST_NAME "${REL_TEST_NAME}")
    string(REPLACE " " "_" REL_TEST_NAME "${REL_TEST_NAME}")

    add_executable(${REL_TEST_NAME} "${TEST_SRC}")

    target_link_libraries(${REL_TEST_NAME} PRIVATE
        dataset_convert_lib
    )

    add_test(NAME ${REL_TEST_NAME} COMMAND ${REL_TEST_NAME})
endfunction()

if(LLAMA_DATASET_CONVERTER_BUILD_TESTS)
    enable_testing()

    file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
        "${CMAKE_CURRENT_SOURCE_DIR}/tests/data/*.cpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/tests/integration/*.cpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/tests/streaming/*.cpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/tests/unit/*.cpp"
    )

    foreach(TEST_SRC IN LISTS TEST_SOURCES)
        add_test_target("${TEST_SRC}")
    endforeach()
endif()

0 commit comments

Comments
 (0)