// This two-pass approach allows processing datasets significantly larger than
// available RAM.
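//
// A minimal sketch of the two-pass shape described above (illustrative only;
// the real pass logic lives in llama_gguf_converter, and the pass contents
// below are assumptions, not a description of its actual implementation):
//
//   pass 1: stream each input file, tokenize, and collect the sequence count
//           and per-sequence lengths so the GGUF header can be sized up front;
//   pass 2: stream the input again and write one tokenized sequence at a time,
//           so peak memory stays at a single sequence rather than the dataset.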

+ #include "log.h"
#include <algorithm>   // For std::min
#include <array>       // For std::array
#include <cinttypes>   // For PRIu64
#include "dataset-to-gguf/llama-gguf-converter.h"
#include "dataset-to-gguf/llama-gguf-reader.h"
#include "llama.h"     // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
-
+ #define PREVIEW_COUNT 1
int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
        return 1;
    }

    // Print parameters for verification
-    printf("Parameters:\n");
-    printf("Model for tokenizer: %s\n", params.model.path.c_str());
-    printf("Input files: ");
+    LOG_INF("Parameters:\n");
+    LOG_INF("Model for tokenizer: %s\n", params.model.path.c_str());
+    LOG_INF("Input files: ");
    for (auto & i : params.in_files) {
-        printf("%s ", i.c_str());
-    }
-    printf("\nOutput file: %s\n", params.out_file.c_str());
-    printf("Max sequence length: %d\n", params.max_seq_len);
-    printf("Pre-tokenized input: %s\n", params.pre_tokenized ? "Yes" : "No");
-    printf("Input type: %s\n", params.dataset_format.c_str());
-    printf("Do preview: %s\n", params.do_preview ? "Yes" : "No");
-    if (params.do_preview) {
-        printf("Preview count: %d\n", params.preview_count);
-        printf("Detokenize preview: %s\n", params.detokenize_preview ? "Yes" : "No");
+        LOG_INF("%s ", i.c_str());
    }
-#ifdef LLAMA_PARQUET
-    if (params.dataset_format == "parquet") {
-        printf("Parquet text column: %s\n", params.parquet_text_column.c_str());
-        printf("Parquet tokens column: %s\n", params.parquet_tokens_column.c_str());
+    LOG_INF("\nOutput file: %s\n", params.out_file.c_str());
+    LOG_INF("Max sequence length: %d\n", params.max_seq_len);
+    LOG_INF("Input type: %s\n", params.dataset_format.c_str());
+    LOG_INF("Do preview: %s\n", params.do_preview ? "Yes" : "No");
+    if (params.dataset_format != "text") {
+        LOG_INF("Dataset column: %s\n", params.dataset_column.c_str());
    }
-#endif
-    printf("\n");
+    LOG_INF("\n");
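    // For a single text input, the logging block above emits output of this
    // shape (paths and values are illustrative placeholders, not captured output):
    //
    //   Parameters:
    //   Model for tokenizer: models/tokenizer.gguf
    //   Input files: data.txt
    //   Output file: dataset.gguf
    //   Max sequence length: 2048
    //   Input type: text
    //   Do preview: Yes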

    // Initialize llama.cpp
    llama_backend_init();
@@ -64,125 +57,122 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == nullptr) {
-        fprintf(stderr, "error: failed to load model from %s\n", params.model.path.c_str());
+        LOG_ERR("error: failed to load model from %s\n", params.model.path.c_str());
        llama_backend_free();
        return 1;
    }

    // --- Diagnostic Test: Reading tokenizer model GGUF file ---
-    printf("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
+    LOG_INF("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
    try {
        llama_gguf_reader tokenizer_model_reader(params.model.path);
        if (tokenizer_model_reader.llama_gguf_reader_is_initialized()) {
-            printf("Tokenizer Model GGUF file opened successfully.\n");
-            printf("Tokenizer Model Name: %s\n",
+            LOG_INF("Tokenizer Model GGUF file opened successfully.\n");
+            LOG_INF("Tokenizer Model Name: %s\n",
                    tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
-            printf("Tokenizer Model Architecture: %s\n",
+            LOG_INF("Tokenizer Model Architecture: %s\n",
                    tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
-            printf("Tokenizer Model Tensor Count: %llu\n",
+            LOG_INF("Tokenizer Model Tensor Count: %lld\n",  // %lld to match the signed long long cast below
                    static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
-            printf("Diagnostic Test: Tokenizer Model GGUF read successful.\n");
+            LOG_INF("Diagnostic Test: Tokenizer Model GGUF read successful.\n");
        } else {
-            fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
+            LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
            llama_model_free(model);  // Free model before exiting
            llama_backend_free();
            return 1;
        }
    } catch (const std::runtime_error & e) {
-        fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
+        LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
        llama_model_free(model);  // Free model before exiting
        llama_backend_free();
        return 1;
    }
-    printf("--- End of Diagnostic Test ---\n\n");
+    LOG_INF("--- End of Diagnostic Test ---\n\n");
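    // Note: the converter below needs the model only for its tokenizer/vocab
    // (this tool never runs inference), and a false return is its sole error
    // signal, so detailed failure logging is assumed to happen inside
    // llama_gguf_converter itself.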

    // Create and run the converter
    llama_gguf_converter converter;
    bool success = converter.llama_gguf_converter_convert(params, model);

    if (!success) {
-        fprintf(stderr, "error: GGUF conversion failed.\n");
+        LOG_ERR("error: GGUF conversion failed.\n");
        llama_model_free(model);  // Free model on conversion failure
        llama_backend_free();
        return 1;
    }

-    printf("Conversion successful!\n");
-    printf("Output file: %s\n", params.out_file.c_str());
+    LOG_INF("Conversion successful!\n");
+    LOG_INF("Output file: %s\n", params.out_file.c_str());
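    // Layout assumed by the preview below: dataset-level metadata stored under
    // training.* keys, plus one GGUF tensor per tokenized sequence, reported
    // as training.tensor.<i>.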

    // --- Preview generated GGUF file (if requested) ---
    if (params.do_preview) {
-        printf("\n--- Previewing generated GGUF file ---\n");
+        LOG_INF("\n--- Previewing generated GGUF file ---\n");
        try {
            llama_gguf_reader reader(params.out_file);

            if (!reader.llama_gguf_reader_is_initialized()) {
-                fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
+                LOG_ERR("error: llama_gguf_reader failed to initialize for preview.\n");
                llama_model_free(model);  // Free model before exiting
                llama_backend_free();
                return 1;
            }

-            printf("Dataset Name: %s\n",
+            LOG_INF("Dataset Name: %s\n",
                    reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
-            printf("Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
-            printf("Tokenizer Model: %s\n",
+            LOG_INF("Sequence Count: %llu\n",  // unsigned cast to match %llu
+                    static_cast<unsigned long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
+            LOG_INF("Tokenizer Model: %s\n",
                    reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());

            int64_t tensor_count = reader.llama_gguf_reader_get_tensor_count();
            if (tensor_count > 0) {
                // Print the first N sequences
-                for (int64_t i = 0; i < std::min((int64_t) params.preview_count, tensor_count); ++i) {
-                    printf("Sequence (training.tensor.%" PRId64 "):\n", i);
+                for (int64_t i = 0; i < std::min(static_cast<int64_t>(PREVIEW_COUNT), tensor_count); ++i) {
+                    LOG_INF("Sequence (training.tensor.%" PRId64 "):\n", i);
                    std::vector<llama_token> sequence_tokens;
                    if (reader.llama_gguf_reader_read_tensor_data(i, sequence_tokens)) {
-                        printf("Length: %zu tokens\n", sequence_tokens.size());
-                        printf("Tokens: [");
+                        LOG_INF("Length: %zu tokens\n", sequence_tokens.size());
+                        LOG_INF("Tokens: [");
                        for (size_t j = 0; j < std::min((size_t) 10, sequence_tokens.size());
                             ++j) {  // Print up to 10 tokens
-                            printf("%d%s", sequence_tokens[j],
+                            LOG_INF("%d%s", sequence_tokens[j],
                                    (j == std::min((size_t) 10, sequence_tokens.size()) - 1) ? "" : ", ");
                        }
                        if (sequence_tokens.size() > 10) {
-                            printf("...");
+                            LOG_INF("...");
                        }
-                        printf("]\n");
-
-                        if (params.detokenize_preview) {
-                            // Detokenization
-                            std::string detokenized_text = "";
-                            // Buffer for a single token
-                            std::array<char, 256> piece_buf;  // Large enough buffer for a single token
-                            // Ensure model is valid before calling llama_model_get_vocab
-                            if (model != nullptr) {
-                                for (llama_token token : sequence_tokens) {
-                                    int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
-                                                                       piece_buf.data(), piece_buf.size(), 1, false);
-                                    if (n_chars > 0) {
-                                        detokenized_text.append(piece_buf.data(), n_chars);
-                                    }
+                        LOG_INF("]\n");
+                        // Detokenization
+                        std::string detokenized_text = "";
+                        // Buffer for a single token
+                        std::array<char, 256> piece_buf;  // Large enough buffer for a single token
+                        // Ensure model is valid before calling llama_model_get_vocab
+                        if (model != nullptr) {
+                            for (llama_token token : sequence_tokens) {
+                                int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
+                                                                   piece_buf.data(), piece_buf.size(), 1, false);
+                                if (n_chars > 0) {
+                                    detokenized_text.append(piece_buf.data(), n_chars);
                                }
-                                printf("Detokenized: \"%s\"\n", detokenized_text.c_str());
-                            } else {
-                                fprintf(stderr, "Warning: Cannot detokenize preview, model is null.\n");
                            }
+                            LOG_INF("Detokenized: \"%s\"\n", detokenized_text.c_str());
+                        } else {
+                            LOG_ERR("Warning: Cannot detokenize preview, model is null.\n");
                        }
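                        // A note on the llama_token_to_piece() call above: it returns the
                        // number of bytes written (negative if the 256-byte buffer is too
                        // small, in which case the piece is silently skipped here); lstrip = 1
                        // skips up to one leading space, and special = false leaves special
                        // tokens un-rendered.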

                    } else {
-                        fprintf(stderr, "Error: Could not read data for sequence %" PRId64 ".\n", i);
+                        LOG_ERR("Error: Could not read data for sequence %" PRId64 ".\n", i);
                    }
                }
            } else {
-                printf("No sequences found in the GGUF file.\n");
+                LOG_INF("No sequences found in the GGUF file.\n");
            }

        } catch (const std::runtime_error & e) {
-            fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
+            LOG_ERR("error: GGUF preview failed: %s\n", e.what());
            llama_model_free(model);  // Free model before exiting
            llama_backend_free();
            return 1;
        }
-        printf("--- End of GGUF file preview ---\n");
+        LOG_INF("--- End of GGUF file preview ---\n");
    }

    // Clean up llama model and backend after all usage