Skip to content

Commit daf0d16

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit daf0d16

File tree

3 files changed

+23
-12
lines changed

3 files changed

+23
-12
lines changed

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,6 +118,8 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,9 +419,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419419
params.max_seq_len = 128;
420420
params.pre_tokenized = false;
421421
params.dataset_format = "text";
422+
#ifdef LLAMA_PARQUET
422423
params.parquet_text_column = "text"; // Not used for text, but for completeness
423424
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
425+
#endif
425426
llama_gguf_converter converter;
426427
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427428

0 commit comments

Comments
 (0)