lexasub
diff --git a/‎CMakeLists.txt‎
Lines changed: 8 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎common/arg.cpp‎
Lines changed: 44 additions & 4 deletions b/‎common/arg.cpp‎
Lines changed: 44 additions & 4 deletions
diff --git a/‎common/common.h‎
Lines changed: 9 additions & 2 deletions b/‎common/common.h‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎ggml/include/gguf.h‎
Lines changed: 8 additions & 0 deletions b/‎ggml/include/gguf.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎ggml/src/gguf.cpp‎
Lines changed: 43 additions & 3 deletions b/‎ggml/src/gguf.cpp‎
Lines changed: 43 additions & 3 deletions
diff --git a/‎tools/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎tools/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tools/dataset-converter/CMakeLists.txt‎
Lines changed: 99 additions & 0 deletions b/‎tools/dataset-converter/CMakeLists.txt‎
Lines changed: 99 additions & 0 deletions
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+# Report the selected build type in the configure log.
+message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 
@@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+# Optional Parquet dataset support, backed by the Arrow/Parquet C++ libraries.
+option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF)
+
+if(LLAMA_PARQUET)
+    # Expose the switch to the sources as the LLAMA_PARQUET preprocessor macro.
+    add_compile_definitions(LLAMA_PARQUET)
+endif()
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
@@ -1489,7 +1489,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
         string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -1551,7 +1551,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             params.in_files.push_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-bf", "--binary-file"}, "FNAME",
         "binary file containing the prompt (default: none)",
@@ -2637,9 +2637,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-o", "--output", "--output-file"}, "FNAME",
         string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
-            params.out_file = value;
+          params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3511,5 +3511,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+    // --dataset-format FORMAT: stored verbatim in params.dataset_format (no validation is done here)
+    add_opt(common_arg(
+        {"--dataset-format"}, "FORMAT",
+        string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()),
+        [](common_params & params, const std::string & format) {
+            params.dataset_format = format; //TODO ENUM CLASS
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    // --max-seq-len N: stored in params.max_seq_len as given (no range check is done here)
+    add_opt(common_arg(
+        {"--max-seq-len"}, "N",
+        string_format("max sequence length (default: %d)", params.max_seq_len),
+        [](common_params & params, int value) {
+            params.max_seq_len = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    // --pre-tokenized: value-less flag, switches params.pre_tokenized on
+    add_opt(common_arg(
+        {"--pre-tokenized"},
+        "input file contains pre-tokenized data (space-separated token IDs)",
+        [](common_params & cfg) { cfg.pre_tokenized = true; }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    // --preview: value-less flag, switches params.do_preview on
+    add_opt(common_arg(
+        {"--preview"},
+        "read and print metadata and first sequence from the output GGUF file (enables preview)",
+        [](common_params & cfg) { cfg.do_preview = true; }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    // --dataset-column NAME: stored verbatim in params.dataset_column
+    add_opt(common_arg(
+        {"--dataset-column"}, "NAME",
+        string_format("column name for data in dataset files (default: '%s')", params.dataset_column.c_str()),
+        [](common_params & params, const std::string & dataset_column) {
+            params.dataset_column = dataset_column;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
     return ctx_arg;
 }
@@ -4,12 +4,13 @@
 
 #include "llama-cpp.h"
 
+#include <cmath>
+#include <map>
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
-#include <map>
-#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +83,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -300,6 +302,7 @@ struct common_params {
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+    std::string dataset_format = "text"; // "text" | "parquet"
 
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -464,6 +467,10 @@ struct common_params {
     // return false from callback to abort model loading or true to continue
     llama_progress_callback load_progress_callback = NULL;
     void *                  load_progress_callback_user_data = NULL;
+    int32_t max_seq_len = 2048;
+    bool do_preview = false;
+    bool pre_tokenized = false;
+    std::string dataset_column = "data";
 };
 
 // call once at the start of a program if it uses libcommon
 
@@ -197,6 +197,14 @@ extern "C" {
     // writes the meta data to pointer "data"
     GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
+    // replaces the data pointer stored in the context; the previously stored pointer is not released by this call
+    GGML_API void   gguf_context_set_data(struct gguf_context * ctx, void * data);
+    // returns the data pointer currently stored in the context
+    GGML_API void *  gguf_context_get_data(struct gguf_context * ctx);
+    // creates a context from in-memory metadata instead of reading a file
+    // returns NULL if alignment is not a power of 2
+    // NOTE(review): the implementation casts `metadata` to a C++
+    //               std::unordered_map<std::string, std::string>, so C callers cannot
+    //               provide it - confirm that this is intended for a C API
+    GGML_API struct gguf_context * gguf_init_from_in_memory_data(
+        const void* metadata,
+        uint32_t version,
+        size_t alignment
+    );
+
 #ifdef  __cplusplus
 }
 #endif
@@ -1,6 +1,3 @@
-#include "ggml.h"
-#include "ggml-backend.h"
-#include "ggml-impl.h"
 #include "gguf.h"
 
 #include <cinttypes>
@@ -13,8 +10,13 @@
 #include <new>
 #include <stdexcept>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml.h"
+
 template <typename T>
 struct type_to_gguf_type;
 
@@ -316,6 +318,14 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
     return true;
 }
 
+// Returns the raw data pointer held by the context; ctx is dereferenced without a null check.
+void * gguf_context_get_data(gguf_context * ctx) {
+    void * const payload = ctx->data;
+    return payload;
+}
+
+// Overwrites the data pointer held by the context; the old pointer is simply dropped, not freed.
+void gguf_context_set_data(gguf_context * ctx, void * data) {
+    gguf_context & target = *ctx;
+    target.data = data;
+}
+
 struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
     const struct gguf_reader gr(file);
     struct gguf_context * ctx = new gguf_context;
@@ -926,6 +936,36 @@ int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
     return tensor_id;
 }
 
+// Creates a gguf_context whose KV section is filled from in-memory metadata.
+//   _metadata - must point to a std::unordered_map<std::string, std::string>;
+//               every entry is copied into the context as a KV pair
+//   version   - stored as the version of the new context
+//   alignment - stored as the alignment of the new context, must be a power of 2
+// Returns nullptr if _metadata is null or alignment is not a power of 2.
+// NOTE(review): handing a C++ container through a void * is fragile for an API declared
+//               in an extern "C" header - confirm the intended callers.
+struct gguf_context * gguf_init_from_in_memory_data(const void * _metadata, uint32_t version, size_t alignment) {
+    if (_metadata == nullptr) {
+        GGML_LOG_ERROR("%s: metadata is NULL\n", __func__);
+        return nullptr;
+    }
+
+    // validate the arguments before allocating so the error paths have nothing to release
+    if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
+        GGML_LOG_ERROR("%s: alignment %zu is not a power of 2\n", __func__, alignment);
+        return nullptr;
+    }
+
+    const auto * metadata_map = static_cast<const std::unordered_map<std::string, std::string> *>(_metadata);
+
+    // a failing plain `new` throws std::bad_alloc instead of returning nullptr, so no null check follows
+    struct gguf_context * ctx = new gguf_context{};
+    ctx->version   = version;
+    ctx->alignment = alignment;
+    ctx->offset    = 0;
+    ctx->size      = 0;
+    ctx->data      = nullptr;
+
+    ctx->kv.reserve(metadata_map->size());
+    for (const auto & entry : *metadata_map) {
+        ctx->kv.push_back(gguf_kv{entry.first, entry.second});
+    }
+
+    return ctx;
+}
+
 size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
     GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
     return ctx->info[tensor_id].offset;
 
@@ -36,4 +36,5 @@ else()
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
     endif()
+    add_subdirectory(dataset-converter)
 endif()
@@ -0,0 +1,99 @@
+cmake_minimum_required(VERSION 3.15)
+
+# Directory-scoped header search paths, shared by every target defined below.
+# First the tool directory itself and the enclosing project directories ...
+include_directories(
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../common"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../.."
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../src"
+)
+
+# ... then the sub-folders of the tool, in the same order as before.
+foreach(component formats/gguf formats/parquet formats/text core streaming safety validation)
+    include_directories("${CMAKE_CURRENT_SOURCE_DIR}/${component}")
+endforeach()
+
+# Option to control the build of the dataset-converter tool
+option(PROJECT_BUILD_DATASET_CONVERTER "Build the dataset-converter tool" ON)
+
+# Leave before probing any dependency, so that a disabled tool can never make
+# the configure step fail because Arrow/Parquet is not installed.
+if(NOT PROJECT_BUILD_DATASET_CONVERTER)
+    message(STATUS "Skipping build of dataset-converter tool.")
+    return()
+endif()
+
+# Check for Arrow/Parquet dependencies (only when Parquet support is requested)
+if(LLAMA_PARQUET)
+    find_package(Arrow REQUIRED)
+    find_package(Parquet REQUIRED)
+    # Seen by the sources of this directory as the LLAMA_DATASET_PARQUET_SUPPORT macro.
+    add_compile_definitions(LLAMA_DATASET_PARQUET_SUPPORT)
+endif()
+
+# Set the C++ standard to 17
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Gather all .cpp source files of the static library.
+# CONFIGURE_DEPENDS makes the build re-check the glob, so added or removed
+# source files are picked up without a manual re-configure.
+file(GLOB_RECURSE DATASET_CONVERTER_SOURCES CONFIGURE_DEPENDS
+    "${CMAKE_CURRENT_SOURCE_DIR}/core/*.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/formats/gguf/*.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/formats/parquet/*.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/formats/text/*.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/streaming/*.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/validation/*.cpp"
+)
+
+# Static library shared by the converter executable and the tests.
+add_library(dataset_convert_lib STATIC ${DATASET_CONVERTER_SOURCES})
+
+# Threads::Threads is an imported target created by find_package(Threads).
+# Request it here so this directory does not rely on another one having found it.
+find_package(Threads REQUIRED)
+
+# Link dependencies to the target
+target_link_libraries(dataset_convert_lib PRIVATE
+        llama
+        common
+        Threads::Threads
+        ggml
+)
+
+# Link Arrow/Parquet if enabled
+if(LLAMA_PARQUET)
+    target_link_libraries(dataset_convert_lib PRIVATE
+            Arrow::arrow_shared
+            Parquet::parquet_shared
+    )
+endif()
+
+# Define the executable target
+add_executable(dataset_converter "${CMAKE_CURRENT_SOURCE_DIR}/tools/convert-to-gguf.cpp")
+target_link_libraries(dataset_converter PRIVATE dataset_convert_lib)
+
+# Installation rule for the executable.
+# The destination is relative, so it is resolved against the install prefix at
+# install time and keeps working with `cmake --install --prefix`, DESTDIR and CPack.
+install(TARGETS dataset_converter
+    RUNTIME DESTINATION bin
+)
+
+enable_testing()
+
+# add_test_target(<test_src>)
+#
+# Builds one test executable from <test_src>, links it against
+# dataset_convert_lib and registers it as a test under the same name.
+# The name is the path of <test_src> relative to this directory with
+# '/', '\', '.' and ' ' replaced by '_'.
+function(add_test_target TEST_SRC)
+    # Quoted expansions keep paths containing spaces or ';' as a single argument.
+    file(RELATIVE_PATH REL_TEST_NAME "${CMAKE_CURRENT_SOURCE_DIR}" "${TEST_SRC}")
+    string(REPLACE "/" "_" REL_TEST_NAME "${REL_TEST_NAME}")
+    string(REPLACE "\\" "_" REL_TEST_NAME "${REL_TEST_NAME}")
+    string(REPLACE "." "_" REL_TEST_NAME "${REL_TEST_NAME}")
+    string(REPLACE " " "_" REL_TEST_NAME "${REL_TEST_NAME}")
+
+    add_executable(${REL_TEST_NAME} "${TEST_SRC}")
+
+    target_link_libraries(${REL_TEST_NAME} PRIVATE
+            dataset_convert_lib
+    )
+
+    add_test(NAME ${REL_TEST_NAME} COMMAND ${REL_TEST_NAME})
+endfunction()
+
+# Collect every test source file.
+# CONFIGURE_DEPENDS makes the build re-check the glob, so new test files are
+# picked up without a manual re-configure.
+file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
+        "${CMAKE_CURRENT_SOURCE_DIR}/tests/data/*.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/tests/integration/*.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/tests/streaming/*.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/tests/unit/*.cpp"
+)
+
+# One executable and one registered test per source file.
+foreach(TEST_SRC IN LISTS TEST_SOURCES)
+    add_test_target("${TEST_SRC}")
+endforeach()