Skip to content

Commit b264c38

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit b264c38

File tree

3 files changed

+43
-13
lines changed

3 files changed

+43
-13
lines changed

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,6 +118,8 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,25 @@ static std::string g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specif
2828
return false; \
2929
} \
3030
} while (0)
31+
bool SetUpLlamaBackend;
32+
bool Testllama_gguf_file_DefaultConstructorInitializesContext;
33+
bool Testllama_gguf_file_ConstructorFromFileThrowsOnError;
34+
bool Testllama_gguf_file_SetAndGetMetadataString;
35+
bool Testllama_gguf_file_SetAndGetMetadataU64;
36+
bool Testllama_gguf_file_SetAndGetMetadataStringArray;
37+
bool CreateTestllama_gguf_file;const std::string & path, llama_model * model_ptr
38+
bool Testllama_gguf_reader_ConstructorInitializesFromFile;
39+
bool Testllama_gguf_reader_GetMetadata;
40+
bool Testllama_gguf_reader_GetTensorCount;
41+
bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize;
42+
bool Testllama_gguf_reader_ReadTensorData;
43+
bool Testllama_gguf_reader_ReadTensorDataInvalidIndex;
44+
bool TestTextDataReader_OpenFile;
45+
bool TestTextDataReader_ReadNextSequenceTextMode;
46+
bool TestTextDataReader_ReadNextSequencePreTokenizedMode;
47+
bool TestTextDataReader_ResetFunctionality;
48+
bool TestTextDataReader_GetTotalSequences;
49+
bool Testllama_gguf_converter_ConvertTextFileSuccess;
3150

3251
// Global setup for llama.cpp backend
3352
bool SetUpLlamaBackend() {
@@ -419,9 +438,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419438
params.max_seq_len = 128;
420439
params.pre_tokenized = false;
421440
params.dataset_format = "text";
441+
#ifdef LLAMA_PARQUET
422442
params.parquet_text_column = "text"; // Not used for text, but for completeness
423443
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
444+
#endif
425445
llama_gguf_converter converter;
426446
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427447

@@ -448,7 +468,7 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
448468
// Main function to run all tests
449469
// =============================================================================
450470

451-
int main(int argc, char ** argv) {
471+
int main() {
452472
printf("Running dataset-to-gguf tests...\n\n");
453473

454474
// Global setup for llama.cpp backend

0 commit comments

Comments (0)