
Commit ed53dd5
author: lexasub
tool: add conversion of text/parquet to custom format
1 parent 021dfa8 commit ed53dd5

File tree: 8 files changed, +60 -40 lines

tools/dataset-converter/CMakeLists.txt

Lines changed: 22 additions & 6 deletions

@@ -3,6 +3,8 @@ include_directories(.
     ../../common
     ../../
     ../../src
+    ../../ggml/include
+    ../../include
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/gguf
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/parquet
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/text
@@ -62,13 +64,25 @@ endif()
 add_executable(dataset_converter "${CMAKE_CURRENT_SOURCE_DIR}/tools/convert-to-gguf.cpp")
 target_link_libraries(dataset_converter PRIVATE dataset_convert_lib)
 
+add_executable(streaming_analyzer "${CMAKE_CURRENT_SOURCE_DIR}/tools/streaming-optimization-analysis.cpp")
+target_link_libraries(streaming_analyzer PRIVATE dataset_convert_lib)
+
+add_executable(test_data_validator "${CMAKE_CURRENT_SOURCE_DIR}/tools/test-data-validation-tool.cpp")
+target_link_libraries(test_data_validator PRIVATE dataset_convert_lib)
+
 # Installation rule for the executable
-install(TARGETS dataset_converter
+install(TARGETS dataset_converter streaming_analyzer test_data_validator
     DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
 )
 
 enable_testing()
+add_executable(create-test-data ${CMAKE_CURRENT_SOURCE_DIR}/tests/data/create_test_data.cpp)
+target_link_libraries(create-test-data PRIVATE dataset_convert_lib)
+add_executable(create-test-data-gguf ${CMAKE_CURRENT_SOURCE_DIR}/tests/data/create_test_gguf.cpp)
+target_link_libraries(create-test-data-gguf PRIVATE dataset_convert_lib)
 
+set_target_properties(create-test-data PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+set_target_properties(create-test-data-gguf PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 function(add_test_target TEST_SRC)
     get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
 
@@ -79,16 +93,18 @@ function(add_test_target TEST_SRC)
     string(REPLACE " " "_" REL_TEST_NAME ${REL_TEST_NAME})
 
     add_executable(${REL_TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${REL_TEST_NAME} PRIVATE dataset_convert_lib)
 
-    target_link_libraries(${REL_TEST_NAME} PRIVATE
-        dataset_convert_lib
-    )
+    set_target_properties(${REL_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-    add_test(NAME ${REL_TEST_NAME} COMMAND ${REL_TEST_NAME})
+    add_test(
+        NAME ${REL_TEST_NAME}
+        COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_BINARY_DIR}
+            sh -c "./create-test-data && ./create-test-data-gguf && ./${REL_TEST_NAME}"
+    )
 endfunction()
 
 file(GLOB_RECURSE TEST_SOURCES
-    "${CMAKE_CURRENT_SOURCE_DIR}/tests/data/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/integration/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/streaming/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/unit/*.cpp"
tools/dataset-converter/README.md

Lines changed: 1 addition & 1 deletion

@@ -144,7 +144,7 @@ cmake --build build
 Run the tests with:
 
 ```bash
-ctest -R "dataset|streaming|integration"
+ctest -R "_unit|_streaming|_integration"
 ```
 
 ## 10. Important Note on `safetensors`

tools/dataset-converter/core/llama-dataset-utils.cpp

Lines changed: 6 additions & 6 deletions

@@ -22,21 +22,21 @@ void set_error_with_code(enum dataset_error code, const char* msg) {
     if (!msg) {
         msg = "Unknown error (null message)";
     }
-
+
     g_error_state.code = code;
-
+
     // Use safer string copying with proper bounds checking
     size_t msg_len = strlen(msg);
     size_t max_len = sizeof(g_error_state.message) - 1;
-
+
     if (msg_len > max_len) {
         // Truncate message if too long
         memcpy(g_error_state.message, msg, max_len);
         g_error_state.message[max_len] = '\0';
     } else {
         strcpy(g_error_state.message, msg);
     }
-
+
     g_error_state.has_error = true;
 
     LLAMA_LOG_ERROR("%s", g_error_state.message);
@@ -116,7 +116,7 @@ struct llama_dataset* dataset_alloc(enum dataset_type type, bool streaming) {
         free(dataset);
         return nullptr;
     }
-
+
     // Initialize optimization manager to nullptr (will be created on demand)
     dataset->optimization_manager = nullptr;
 }
@@ -299,7 +299,7 @@ struct ggml_tensor* create_sequence_tensor(struct ggml_context* ggml_ctx,
 // Check if streaming is supported for a dataset type and file
 bool llama_dataset_supports_streaming(enum dataset_type type, const char* path) {
     // Currently only GGUF supports streaming
-    if (type == DATASET_GGUF) {
+    if (type == DATASET_GGUF || type == DATASET_PARQUET) {
        return true;
    }
 
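The truncation branch in `set_error_with_code` above is the standard bounded-copy pattern. A minimal standalone sketch of the same idea, where the `error_state` struct is a hypothetical stand-in for the tool's global state, not its actual definition:

```cpp
#include <cstring>

// Hypothetical stand-in for the tool's error state; the real struct lives in
// llama-dataset-utils.cpp and may differ in size and fields.
struct error_state {
    char message[256];
};

// Copy msg into state->message with explicit bounds checking, truncating if
// needed so the buffer is always NUL-terminated, mirroring the logic in
// set_error_with_code above.
static void set_message(struct error_state * state, const char * msg) {
    const size_t max_len = sizeof(state->message) - 1; // reserve room for '\0'
    const size_t msg_len = strlen(msg);
    const size_t n       = msg_len > max_len ? max_len : msg_len;
    memcpy(state->message, msg, n);
    state->message[n] = '\0';
}
```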
tools/dataset-converter/core/llama-dataset.cpp

Lines changed: 11 additions & 3 deletions

@@ -287,7 +287,7 @@ void to_gguf(struct llama_dataset * dataset, const char * path) {
     }
 
     // For other formats (TEXT, PARQUET), we need to create a new GGUF file
-    LLAMA_LOG_INFO("Converting %s dataset to GGUF file: %s",
+    LLAMA_LOG_INFO("Converting %s dataset to GGUF file: %s\n",
                    dataset->type == DATASET_TEXT ? "TEXT" : "PARQUET", path);
 
     // Create a new GGUF context
@@ -312,15 +312,23 @@ void to_gguf(struct llama_dataset * dataset, const char * path) {
             case GGUF_TYPE_INT32:
                 gguf_set_val_i32(new_ctx, key, gguf_get_val_i32(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_UINT32:
+                gguf_set_val_u32(new_ctx, key, gguf_get_val_u32(dataset->ctx, i));
+                break;
             case GGUF_TYPE_INT64:
                 gguf_set_val_i64(new_ctx, key, gguf_get_val_i64(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_UINT64:
+                gguf_set_val_u64(new_ctx, key, gguf_get_val_u64(dataset->ctx, i));
+                break;
             case GGUF_TYPE_FLOAT32:
                 gguf_set_val_f32(new_ctx, key, gguf_get_val_f32(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_FLOAT64:
+                gguf_set_val_f64(new_ctx, key, gguf_get_val_f64(dataset->ctx, i));
+                break;
             default:
-                // Skip other types for now
-                LLAMA_LOG_WARN("Skipping metadata key '%s' with unsupported type %d", key, type);
+                LLAMA_LOG_WARN("Bad metadata key '%s' with type %d", key, type);
                 break;
         }
     }

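For context, the switch extended above sits inside a loop that copies each key/value pair from the source GGUF context into the new one. A condensed sketch of that pattern against ggml's public `gguf.h` API, covering only the scalar types this commit handles (strings, booleans, and arrays are deliberately omitted here):

```cpp
#include "gguf.h"  // ggml's public GGUF API

// Copy every scalar key/value pair from src into dst, dispatching on the
// stored type, as in the to_gguf() switch above. Simplified sketch only.
static void copy_scalar_kv(struct gguf_context * dst, const struct gguf_context * src) {
    const int64_t n_kv = gguf_get_n_kv(src);
    for (int64_t i = 0; i < n_kv; ++i) {
        const char * key = gguf_get_key(src, i);
        switch (gguf_get_kv_type(src, i)) {
            case GGUF_TYPE_INT32:   gguf_set_val_i32(dst, key, gguf_get_val_i32(src, i)); break;
            case GGUF_TYPE_UINT32:  gguf_set_val_u32(dst, key, gguf_get_val_u32(src, i)); break;
            case GGUF_TYPE_INT64:   gguf_set_val_i64(dst, key, gguf_get_val_i64(src, i)); break;
            case GGUF_TYPE_UINT64:  gguf_set_val_u64(dst, key, gguf_get_val_u64(src, i)); break;
            case GGUF_TYPE_FLOAT32: gguf_set_val_f32(dst, key, gguf_get_val_f32(src, i)); break;
            case GGUF_TYPE_FLOAT64: gguf_set_val_f64(dst, key, gguf_get_val_f64(src, i)); break;
            default: break; // strings, bools and arrays are not copied in this sketch
        }
    }
}
```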
tools/dataset-converter/formats/gguf/llama-dataset-gguf-utils.cpp

Lines changed: 9 additions & 10 deletions

@@ -1,9 +1,8 @@
 #include "llama-dataset-gguf-utils.h"
-#include "../../common/log.h"
 
 #include <cstdio>
-#include <cstdlib>
-#include <cstring>
+
+#include "gguf.h"
 
 // Get the total size of the data section in a GGUF context
 size_t gguf_get_data_size(const struct gguf_context * ctx) {
@@ -37,16 +36,16 @@ bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context
     // This function is a compatibility wrapper
     // In the actual GGUF API, tensor loading is handled by gguf_init_from_file
     // when called with appropriate parameters
-
+
     // Since we're implementing this as a utility function, we'll validate
     // that the tensors can be accessed and return true if everything looks good
-
+
     const int n_tensors = gguf_get_n_tensors(gguf_ctx);
-
+
     if (n_tensors <= 0) {
         return true; // No tensors to load is considered success
     }
-
+
     // Validate that we can access tensor metadata
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(gguf_ctx, i);
@@ -61,13 +60,13 @@ bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context
             fprintf(stderr, "Invalid tensor size for tensor %s\n", name);
             return false;
         }
-
+
         enum ggml_type tensor_type = gguf_get_tensor_type(gguf_ctx, i);
         if (tensor_type >= GGML_TYPE_COUNT) {
             fprintf(stderr, "Invalid tensor type for tensor %s\n", name);
             return false;
         }
     }
-
+
     return true;
-}
\ No newline at end of file
+}
tools/dataset-converter/formats/gguf/llama-dataset-gguf-utils.h

Lines changed: 7 additions & 10 deletions

@@ -1,33 +1,30 @@
 #pragma once
-
-#include "gguf.h"
-#include "ggml.h"
-
+#include <cstddef>
 /**
  * @brief Utility functions for GGUF dataset handling.
- *
+ *
  * This file provides utility functions that are missing from the GGUF API
  * but needed for the dataset converter.
  */
 
 /**
  * @brief Get the total size of the data section in a GGUF context.
- *
+ *
  * This function calculates the total size of all tensor data in the GGUF context.
- *
+ *
  * @param ctx The GGUF context
  * @return The total size of the data section in bytes
  */
 size_t gguf_get_data_size(const struct gguf_context * ctx);
 
 /**
  * @brief Load all tensors from a GGUF context into a GGML context.
- *
+ *
  * This function loads all tensor data from a GGUF context into a GGML context.
 * The GGML context must have enough memory allocated to store all tensor data.
- *
+ *
 * @param gguf_ctx The GGUF context containing tensor data
 * @param ggml_ctx The GGML context to load tensors into
 * @return true if successful, false otherwise
 */
-bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context * ggml_ctx);
\ No newline at end of file
+bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context * ggml_ctx);
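A short usage sketch for the two utilities this header declares; it assumes a dataset file readable by `gguf_init_from_file` from ggml's public API and trims error handling:

```cpp
#include <cstdio>

#include "ggml.h"
#include "gguf.h"
#include "llama-dataset-gguf-utils.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <dataset.gguf>\n", argv[0]);
        return 1;
    }

    // Open the GGUF file; no_alloc=false asks gguf to allocate the tensor
    // data inside the returned ggml context.
    struct ggml_context * ggml_ctx = nullptr;
    struct gguf_init_params params = { /*no_alloc =*/ false, /*ctx =*/ &ggml_ctx };
    struct gguf_context * gguf_ctx = gguf_init_from_file(argv[1], params);
    if (!gguf_ctx) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    printf("data section: %zu bytes\n", gguf_get_data_size(gguf_ctx));
    printf("tensors valid: %s\n", gguf_load_tensors(gguf_ctx, ggml_ctx) ? "yes" : "no");

    gguf_free(gguf_ctx);
    ggml_free(ggml_ctx);
    return 0;
}
```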

tools/dataset-converter/formats/parquet/llama-dataset-parquet.cpp

Lines changed: 2 additions & 2 deletions

@@ -296,7 +296,7 @@ static bool create_gguf_from_parquet(const std::shared_ptr<arrow::Table>& table,
         }
     }
 
-    LLAMA_LOG_INFO("Successfully loaded %zu sequences from Parquet file (max_length=%d)",
+    LLAMA_LOG_INFO("Successfully loaded %zu sequences from Parquet file (max_length=%d)\n",
                    all_sequences.size(), max_length);
 
     return true;
@@ -519,7 +519,7 @@ struct llama_dataset * llama_dataset_load_parquet_internal(const char * path, bo
         return nullptr;
    }
 
-    LLAMA_LOG_INFO("Successfully loaded Parquet dataset from %s (%zu sequences, streaming=%s)",
+    LLAMA_LOG_INFO("Successfully loaded Parquet dataset from %s (%zu sequences, streaming=%s)\n",
                    path, dataset->n_seq, streaming ? "true" : "false");
 
     return dataset;
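The `create_gguf_from_parquet` helper in the first hunk consumes an already-materialized `arrow::Table`. As a rough, illustrative sketch (not the tool's actual loading code, and subject to Arrow version differences), such a table is conventionally produced with the Arrow C++ Parquet reader like this:

```cpp
#include <memory>
#include <string>

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

// Read an entire Parquet file into an arrow::Table, the input type consumed
// by a helper like create_gguf_from_parquet above. Illustrative only.
static std::shared_ptr<arrow::Table> read_parquet_table(const std::string & path) {
    std::shared_ptr<arrow::io::ReadableFile> infile;
    PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(path));

    std::unique_ptr<parquet::arrow::FileReader> reader;
    PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));

    std::shared_ptr<arrow::Table> table;
    PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
    return table;
}
```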

tools/dataset-converter/tools/streaming-optimization-analysis.cpp

Lines changed: 2 additions & 2 deletions

@@ -119,7 +119,7 @@ class StreamingOptimizer {
         }
 
         // Issue 4: Test random access performance
-        std::cout << "Testing random access performance..." << std::endl;
+        /*std::cout << "Testing random access performance..." << std::endl;
 
         auto test_random_access = [](struct llama_dataset* dataset) {
             auto start = std::chrono::high_resolution_clock::now();
@@ -179,7 +179,7 @@ class StreamingOptimizer {
             result.issues_found.push_back("Sequential access in streaming mode is slow");
             result.optimizations_needed.push_back("Implement read-ahead buffering for sequential access");
             result.has_issues = true;
-        }
+        }*/
 
     } catch (const std::exception& e) {
         result.issues_found.push_back(std::string("Exception during analysis: ") + e.what());
