Skip to content

Commit b264c38

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit b264c38

File tree

3 files changed

+43
-13
lines changed

3 files changed

+43
-13
lines changed

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,6 +118,8 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,25 @@ static std::string g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specif
2828
return false; \
2929
} \
3030
} while (0)
31+
bool SetUpLlamaBackend;
32+
bool Testllama_gguf_file_DefaultConstructorInitializesContext;
33+
bool Testllama_gguf_file_ConstructorFromFileThrowsOnError;
34+
bool Testllama_gguf_file_SetAndGetMetadataString;
35+
bool Testllama_gguf_file_SetAndGetMetadataU64;
36+
bool Testllama_gguf_file_SetAndGetMetadataStringArray;
37+
bool CreateTestllama_gguf_file;const std::string & path, llama_model * model_ptr
38+
bool Testllama_gguf_reader_ConstructorInitializesFromFile;
39+
bool Testllama_gguf_reader_GetMetadata;
40+
bool Testllama_gguf_reader_GetTensorCount;
41+
bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize;
42+
bool Testllama_gguf_reader_ReadTensorData;
43+
bool Testllama_gguf_reader_ReadTensorDataInvalidIndex;
44+
bool TestTextDataReader_OpenFile;
45+
bool TestTextDataReader_ReadNextSequenceTextMode;
46+
bool TestTextDataReader_ReadNextSequencePreTokenizedMode;
47+
bool TestTextDataReader_ResetFunctionality;
48+
bool TestTextDataReader_GetTotalSequences;
49+
bool Testllama_gguf_converter_ConvertTextFileSuccess;
3150

3251
// Global setup for llama.cpp backend
3352
bool SetUpLlamaBackend() {
@@ -419,9 +438,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419438
params.max_seq_len = 128;
420439
params.pre_tokenized = false;
421440
params.dataset_format = "text";
441+
#ifdef LLAMA_PARQUET
422442
params.parquet_text_column = "text"; // Not used for text, but for completeness
423443
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
444+
#endif
425445
llama_gguf_converter converter;
426446
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427447

@@ -448,7 +468,7 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
448468
// Main function to run all tests
449469
// =============================================================================
450470

451-
int main(int argc, char ** argv) {
471+
int main() {
452472
printf("Running dataset-to-gguf tests...\n\n");
453473

454474
// Global setup for llama.cpp backend

0 commit comments

Comments (0)