
Commit ed53dd5
author: lexasub
tool: add conversion of text/parquet to custom format
1 parent 021dfa8 commit ed53dd5

File tree: 8 files changed, +60 -40 lines

tools/dataset-converter/CMakeLists.txt

Lines changed: 22 additions & 6 deletions

@@ -3,6 +3,8 @@ include_directories(.
     ../../common
     ../../
     ../../src
+    ../../ggml/include
+    ../../include
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/gguf
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/parquet
     ${CMAKE_CURRENT_SOURCE_DIR}/formats/text
@@ -62,13 +64,25 @@ endif()
 add_executable(dataset_converter "${CMAKE_CURRENT_SOURCE_DIR}/tools/convert-to-gguf.cpp")
 target_link_libraries(dataset_converter PRIVATE dataset_convert_lib)
 
+add_executable(streaming_analyzer "${CMAKE_CURRENT_SOURCE_DIR}/tools/streaming-optimization-analysis.cpp")
+target_link_libraries(streaming_analyzer PRIVATE dataset_convert_lib)
+
+add_executable(test_data_validator "${CMAKE_CURRENT_SOURCE_DIR}/tools/test-data-validation-tool.cpp")
+target_link_libraries(test_data_validator PRIVATE dataset_convert_lib)
+
 # Installation rule for the executable
-install(TARGETS dataset_converter
+install(TARGETS dataset_converter streaming_analyzer test_data_validator
     DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
 )
 
 enable_testing()
+add_executable(create-test-data ${CMAKE_CURRENT_SOURCE_DIR}/tests/data/create_test_data.cpp)
+target_link_libraries(create-test-data PRIVATE dataset_convert_lib)
+add_executable(create-test-data-gguf ${CMAKE_CURRENT_SOURCE_DIR}/tests/data/create_test_gguf.cpp)
+target_link_libraries(create-test-data-gguf PRIVATE dataset_convert_lib)
 
+set_target_properties(create-test-data PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+set_target_properties(create-test-data-gguf PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 function(add_test_target TEST_SRC)
     get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
 
@@ -79,16 +93,18 @@ function(add_test_target TEST_SRC)
     string(REPLACE " " "_" REL_TEST_NAME ${REL_TEST_NAME})
 
     add_executable(${REL_TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${REL_TEST_NAME} PRIVATE dataset_convert_lib)
 
-    target_link_libraries(${REL_TEST_NAME} PRIVATE
-        dataset_convert_lib
-    )
+    set_target_properties(${REL_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-    add_test(NAME ${REL_TEST_NAME} COMMAND ${REL_TEST_NAME})
+    add_test(
+        NAME ${REL_TEST_NAME}
+        COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_BINARY_DIR}
+            sh -c "./create-test-data && ./create-test-data-gguf && ./${REL_TEST_NAME}"
+    )
 endfunction()
 
 file(GLOB_RECURSE TEST_SOURCES
-    "${CMAKE_CURRENT_SOURCE_DIR}/tests/data/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/integration/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/streaming/*.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tests/unit/*.cpp"
tools/dataset-converter/README.md

Lines changed: 1 addition & 1 deletion

@@ -144,7 +144,7 @@ cmake --build build
 Run the tests with:
 
 ```bash
-ctest -R "dataset|streaming|integration"
+ctest -R "_unit|_streaming|_integration"
 ```
 
 ## 10. Important Note on `safetensors`

tools/dataset-converter/core/llama-dataset-utils.cpp

Lines changed: 6 additions & 6 deletions

@@ -22,21 +22,21 @@ void set_error_with_code(enum dataset_error code, const char* msg) {
     if (!msg) {
         msg = "Unknown error (null message)";
     }
-
+
     g_error_state.code = code;
-
+
     // Use safer string copying with proper bounds checking
     size_t msg_len = strlen(msg);
     size_t max_len = sizeof(g_error_state.message) - 1;
-
+
     if (msg_len > max_len) {
         // Truncate message if too long
         memcpy(g_error_state.message, msg, max_len);
         g_error_state.message[max_len] = '\0';
     } else {
         strcpy(g_error_state.message, msg);
     }
-
+
     g_error_state.has_error = true;
 
     LLAMA_LOG_ERROR("%s", g_error_state.message);
@@ -116,7 +116,7 @@ struct llama_dataset* dataset_alloc(enum dataset_type type, bool streaming) {
         free(dataset);
         return nullptr;
     }
-
+
     // Initialize optimization manager to nullptr (will be created on demand)
     dataset->optimization_manager = nullptr;
 }
@@ -299,7 +299,7 @@ struct ggml_tensor* create_sequence_tensor(struct ggml_context* ggml_ctx,
 // Check if streaming is supported for a dataset type and file
 bool llama_dataset_supports_streaming(enum dataset_type type, const char* path) {
     // Currently only GGUF supports streaming
-    if (type == DATASET_GGUF) {
+    if (type == DATASET_GGUF || type == DATASET_PARQUET) {
        return true;
    }
 
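The truncation branch in `set_error_with_code` above is the standard bounded-copy pattern. A minimal standalone sketch of the same idea, where the `error_state` struct is a hypothetical stand-in for the tool's global state, not its actual definition:

```cpp
#include <cstring>

// Hypothetical stand-in for the tool's error state; the real struct lives in
// llama-dataset-utils.cpp and may differ in size and fields.
struct error_state {
    char message[256];
};

// Copy msg into state->message with explicit bounds checking, truncating if
// needed so the buffer is always NUL-terminated, mirroring the logic in
// set_error_with_code above.
static void set_message(struct error_state * state, const char * msg) {
    const size_t max_len = sizeof(state->message) - 1; // reserve room for '\0'
    const size_t msg_len = strlen(msg);
    const size_t n       = msg_len > max_len ? max_len : msg_len;
    memcpy(state->message, msg, n);
    state->message[n] = '\0';
}
```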
tools/dataset-converter/core/llama-dataset.cpp

Lines changed: 11 additions & 3 deletions

@@ -287,7 +287,7 @@ void to_gguf(struct llama_dataset * dataset, const char * path) {
     }
 
     // For other formats (TEXT, PARQUET), we need to create a new GGUF file
-    LLAMA_LOG_INFO("Converting %s dataset to GGUF file: %s",
+    LLAMA_LOG_INFO("Converting %s dataset to GGUF file: %s\n",
                    dataset->type == DATASET_TEXT ? "TEXT" : "PARQUET", path);
 
     // Create a new GGUF context
@@ -312,15 +312,23 @@ void to_gguf(struct llama_dataset * dataset, const char * path) {
             case GGUF_TYPE_INT32:
                 gguf_set_val_i32(new_ctx, key, gguf_get_val_i32(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_UINT32:
+                gguf_set_val_u32(new_ctx, key, gguf_get_val_u32(dataset->ctx, i));
+                break;
             case GGUF_TYPE_INT64:
                 gguf_set_val_i64(new_ctx, key, gguf_get_val_i64(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_UINT64:
+                gguf_set_val_u64(new_ctx, key, gguf_get_val_u64(dataset->ctx, i));
+                break;
             case GGUF_TYPE_FLOAT32:
                 gguf_set_val_f32(new_ctx, key, gguf_get_val_f32(dataset->ctx, i));
                 break;
+            case GGUF_TYPE_FLOAT64:
+                gguf_set_val_f64(new_ctx, key, gguf_get_val_f64(dataset->ctx, i));
+                break;
             default:
-                // Skip other types for now
-                LLAMA_LOG_WARN("Skipping metadata key '%s' with unsupported type %d", key, type);
+                LLAMA_LOG_WARN("Bad metadata key '%s' with type %d", key, type);
                 break;
         }
     }

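For context, the switch extended above sits inside a loop that copies each key/value pair from the source GGUF context into the new one. A condensed sketch of that pattern against ggml's public `gguf.h` API, covering only the scalar types this commit handles (strings, booleans, and arrays are deliberately omitted here):

```cpp
#include "gguf.h"  // ggml's public GGUF API

// Copy every scalar key/value pair from src into dst, dispatching on the
// stored type, as in the to_gguf() switch above. Simplified sketch only.
static void copy_scalar_kv(struct gguf_context * dst, const struct gguf_context * src) {
    const int64_t n_kv = gguf_get_n_kv(src);
    for (int64_t i = 0; i < n_kv; ++i) {
        const char * key = gguf_get_key(src, i);
        switch (gguf_get_kv_type(src, i)) {
            case GGUF_TYPE_INT32:   gguf_set_val_i32(dst, key, gguf_get_val_i32(src, i)); break;
            case GGUF_TYPE_UINT32:  gguf_set_val_u32(dst, key, gguf_get_val_u32(src, i)); break;
            case GGUF_TYPE_INT64:   gguf_set_val_i64(dst, key, gguf_get_val_i64(src, i)); break;
            case GGUF_TYPE_UINT64:  gguf_set_val_u64(dst, key, gguf_get_val_u64(src, i)); break;
            case GGUF_TYPE_FLOAT32: gguf_set_val_f32(dst, key, gguf_get_val_f32(src, i)); break;
            case GGUF_TYPE_FLOAT64: gguf_set_val_f64(dst, key, gguf_get_val_f64(src, i)); break;
            default: break; // strings, bools and arrays are not copied in this sketch
        }
    }
}
```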
tools/dataset-converter/formats/gguf/llama-dataset-gguf-utils.cpp

Lines changed: 9 additions & 10 deletions

@@ -1,9 +1,8 @@
 #include "llama-dataset-gguf-utils.h"
-#include "../../common/log.h"
 
 #include <cstdio>
-#include <cstdlib>
-#include <cstring>
+
+#include "gguf.h"
 
 // Get the total size of the data section in a GGUF context
 size_t gguf_get_data_size(const struct gguf_context * ctx) {
@@ -37,16 +36,16 @@ bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context
     // This function is a compatibility wrapper
     // In the actual GGUF API, tensor loading is handled by gguf_init_from_file
     // when called with appropriate parameters
-
+
     // Since we're implementing this as a utility function, we'll validate
     // that the tensors can be accessed and return true if everything looks good
-
+
     const int n_tensors = gguf_get_n_tensors(gguf_ctx);
-
+
     if (n_tensors <= 0) {
         return true; // No tensors to load is considered success
     }
-
+
     // Validate that we can access tensor metadata
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(gguf_ctx, i);
@@ -61,13 +60,13 @@ bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context
             fprintf(stderr, "Invalid tensor size for tensor %s\n", name);
             return false;
         }
-
+
         enum ggml_type tensor_type = gguf_get_tensor_type(gguf_ctx, i);
         if (tensor_type >= GGML_TYPE_COUNT) {
             fprintf(stderr, "Invalid tensor type for tensor %s\n", name);
             return false;
         }
     }
-
+
     return true;
-}
\ No newline at end of file
+}
tools/dataset-converter/formats/gguf/llama-dataset-gguf-utils.h

Lines changed: 7 additions & 10 deletions

@@ -1,33 +1,30 @@
 #pragma once
-
-#include "gguf.h"
-#include "ggml.h"
-
+#include <cstddef>
 /**
  * @brief Utility functions for GGUF dataset handling.
- *
+ *
  * This file provides utility functions that are missing from the GGUF API
  * but needed for the dataset converter.
  */
 
 /**
  * @brief Get the total size of the data section in a GGUF context.
- *
+ *
  * This function calculates the total size of all tensor data in the GGUF context.
- *
+ *
  * @param ctx The GGUF context
  * @return The total size of the data section in bytes
  */
 size_t gguf_get_data_size(const struct gguf_context * ctx);
 
 /**
  * @brief Load all tensors from a GGUF context into a GGML context.
- *
+ *
  * This function loads all tensor data from a GGUF context into a GGML context.
 * The GGML context must have enough memory allocated to store all tensor data.
- *
+ *
 * @param gguf_ctx The GGUF context containing tensor data
 * @param ggml_ctx The GGML context to load tensors into
 * @return true if successful, false otherwise
 */
-bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context * ggml_ctx);
\ No newline at end of file
+bool gguf_load_tensors(const struct gguf_context * gguf_ctx, struct ggml_context * ggml_ctx);
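A short usage sketch for the two utilities this header declares; it assumes a dataset file readable by `gguf_init_from_file` from ggml's public API and trims error handling:

```cpp
#include <cstdio>

#include "ggml.h"
#include "gguf.h"
#include "llama-dataset-gguf-utils.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <dataset.gguf>\n", argv[0]);
        return 1;
    }

    // Open the GGUF file; no_alloc=false asks gguf to allocate the tensor
    // data inside the returned ggml context.
    struct ggml_context * ggml_ctx = nullptr;
    struct gguf_init_params params = { /*no_alloc =*/ false, /*ctx =*/ &ggml_ctx };
    struct gguf_context * gguf_ctx = gguf_init_from_file(argv[1], params);
    if (!gguf_ctx) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    printf("data section: %zu bytes\n", gguf_get_data_size(gguf_ctx));
    printf("tensors valid: %s\n", gguf_load_tensors(gguf_ctx, ggml_ctx) ? "yes" : "no");

    gguf_free(gguf_ctx);
    ggml_free(ggml_ctx);
    return 0;
}
```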

tools/dataset-converter/formats/parquet/llama-dataset-parquet.cpp

Lines changed: 2 additions & 2 deletions

@@ -296,7 +296,7 @@ static bool create_gguf_from_parquet(const std::shared_ptr<arrow::Table>& table,
         }
     }
 
-    LLAMA_LOG_INFO("Successfully loaded %zu sequences from Parquet file (max_length=%d)",
+    LLAMA_LOG_INFO("Successfully loaded %zu sequences from Parquet file (max_length=%d)\n",
                    all_sequences.size(), max_length);
 
     return true;
@@ -519,7 +519,7 @@ struct llama_dataset * llama_dataset_load_parquet_internal(const char * path, bo
         return nullptr;
    }
 
-    LLAMA_LOG_INFO("Successfully loaded Parquet dataset from %s (%zu sequences, streaming=%s)",
+    LLAMA_LOG_INFO("Successfully loaded Parquet dataset from %s (%zu sequences, streaming=%s)\n",
                    path, dataset->n_seq, streaming ? "true" : "false");
 
     return dataset;
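The `create_gguf_from_parquet` helper in the first hunk consumes an already-materialized `arrow::Table`. As a rough, illustrative sketch (not the tool's actual loading code, and subject to Arrow version differences), such a table is conventionally produced with the Arrow C++ Parquet reader like this:

```cpp
#include <memory>
#include <string>

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

// Read an entire Parquet file into an arrow::Table, the input type consumed
// by a helper like create_gguf_from_parquet above. Illustrative only.
static std::shared_ptr<arrow::Table> read_parquet_table(const std::string & path) {
    std::shared_ptr<arrow::io::ReadableFile> infile;
    PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(path));

    std::unique_ptr<parquet::arrow::FileReader> reader;
    PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));

    std::shared_ptr<arrow::Table> table;
    PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
    return table;
}
```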

tools/dataset-converter/tools/streaming-optimization-analysis.cpp

Lines changed: 2 additions & 2 deletions

@@ -119,7 +119,7 @@ class StreamingOptimizer {
         }
 
         // Issue 4: Test random access performance
-        std::cout << "Testing random access performance..." << std::endl;
+        /*std::cout << "Testing random access performance..." << std::endl;
 
         auto test_random_access = [](struct llama_dataset* dataset) {
             auto start = std::chrono::high_resolution_clock::now();
@@ -179,7 +179,7 @@ class StreamingOptimizer {
             result.issues_found.push_back("Sequential access in streaming mode is slow");
             result.optimizations_needed.push_back("Implement read-ahead buffering for sequential access");
             result.has_issues = true;
-        }
+        }*/
 
     } catch (const std::exception& e) {
         result.issues_found.push_back(std::string("Exception during analysis: ") + e.what());
