Skip to content

Commit 872584b

Browse files
author
lexasub
committed
tool: refactor dataset converter
1 parent a26e37b commit 872584b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2085
-2408
lines changed

common/arg.cpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,7 +1482,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14821482
[](common_params & params) {
14831483
params.ctx_shift = false;
14841484
}
1485-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
1485+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_DATASET}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
14861486
add_opt(common_arg(
14871487
{"--chunks"}, "N",
14881488
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1551,7 +1551,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15511551
}
15521552
params.in_files.push_back(value);
15531553
}
1554-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
1554+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_DATASET}));
15551555
add_opt(common_arg(
15561556
{"-bf", "--binary-file"}, "FNAME",
15571557
"binary file containing the prompt (default: none)",
@@ -2541,7 +2541,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25412541
[](common_params & params, const std::string & value) {
25422542
params.model.path = value;
25432543
}
2544-
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
2544+
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DATASET}).set_env("LLAMA_ARG_MODEL"));
25452545
add_opt(common_arg(
25462546
{"-mu", "--model-url"}, "MODEL_URL",
25472547
"model download url (default: unused)",
@@ -2639,7 +2639,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26392639
[](common_params & params, const std::string & value) {
26402640
params.out_file = value;
26412641
}
2642-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
2642+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_DATASET}));
26432643
add_opt(common_arg(
26442644
{"-ofreq", "--output-frequency"}, "N",
26452645
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3517,39 +3517,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35173517
[](common_params & params, const std::string & format) {
35183518
params.dataset_format = format; //TODO ENUM CLASS
35193519
}
3520-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3520+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35213521

35223522
add_opt(common_arg(
35233523
{"--max-seq-len"}, " ",
35243524
string_format("max sequence length (default: %d)", params.max_seq_len),
35253525
[](common_params & params, int32_t max_seq_len) {
35263526
params.max_seq_len = max_seq_len;
35273527
}
3528-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3528+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35293529

35303530
add_opt(common_arg(
35313531
{"--pre-tokenized"},
35323532
string_format("input file contains pre-tokenized data (space-separated token IDs)"),
35333533
[](common_params & params) {
35343534
params.pre_tokenized = true;
35353535
}
3536-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3536+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35373537

35383538
add_opt(common_arg(
35393539
{"--preview"},
35403540
string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
35413541
[](common_params & params) {
35423542
params.do_preview = true;
35433543
}
3544-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3544+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35453545

35463546
add_opt(common_arg(
35473547
{"--dataset-column"}, "<name>",
35483548
string_format("column name for data in dataset files"),
35493549
[](common_params & params, const std::string &dataset_column) {
35503550
params.dataset_column = dataset_column;
35513551
}
3552-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3552+
).set_examples({LLAMA_EXAMPLE_DATASET}));
3553+
3554+
add_opt(common_arg(
3555+
{"--streaming"},
3556+
string_format("enable streaming dataset"),
3557+
[](common_params & params) {
3558+
params.dataset_streaming = true;
3559+
}
3560+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35533561

35543562
return ctx_arg;
35553563
}

common/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ enum llama_example {
8484
LLAMA_EXAMPLE_TTS,
8585
LLAMA_EXAMPLE_DIFFUSION,
8686
LLAMA_EXAMPLE_FINETUNE,
87+
LLAMA_EXAMPLE_DATASET,
8788

8889
LLAMA_EXAMPLE_COUNT,
8990
};
@@ -471,6 +472,7 @@ struct common_params {
471472
bool do_preview = false;
472473
bool pre_tokenized = false;
473474
std::string dataset_column = "data";
475+
bool dataset_streaming = false;
474476
};
475477

476478
// call once at the start of a program if it uses libcommon

tools/dataset-converter/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ target_link_libraries(streaming_analyzer PRIVATE dataset_convert_lib)
7070
add_executable(test_data_validator "${CMAKE_CURRENT_SOURCE_DIR}/tools/test-data-validation-tool.cpp")
7171
target_link_libraries(test_data_validator PRIVATE dataset_convert_lib)
7272

73-
# Installation rule for the executable
7473
install(TARGETS dataset_converter streaming_analyzer test_data_validator
7574
DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
7675
)

tools/dataset-converter/README.md

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,29 +28,20 @@ The Dataset Converter addresses this challenge by:
2828
The tool is run from the command line. The basic syntax is as follows:
2929

3030
```bash
31-
./dataset_converter [options] <input_file> <output_file>
31+
./dataset_converter [options] --in-file <input_file> -o <output_file> --streaming
3232
```
33-
34-
**Options:**
35-
36-
| Flag | Description |
37-
| ------------------- | ------------------------------------------------------------ |
38-
| `-h`, `--help` | Show the help message and exit. |
39-
| `--model MODEL` | Path to the `llama.cpp` model for tokenization (required for text input). |
40-
| `--streaming` | Use streaming mode for large datasets. |
41-
4233
### Examples
4334

4435
**Convert a text file to GGUF:**
4536

4637
```bash
47-
./dataset_converter --model ./models/7B/ggml-model-f16.gguf --streaming input.txt output.gguf
38+
./dataset_converter --model ./models/7B/ggml-model-f16.gguf --streaming --in-file input.txt -o output.gguf
4839
```
4940

5041
**Convert a Parquet file to GGUF:**
5142

5243
```bash
53-
./dataset_converter --streaming input.parquet output.gguf
44+
./dataset_converter --streaming --in-file input.parquet -o output.gguf --dataset-column tokens
5445
```
5546

5647
## 5. Directory Structure
@@ -115,13 +106,6 @@ struct llama_dataset * llama_dataset_load_gguf(const char * path, bool streaming
115106
struct llama_dataset * llama_dataset_load_text(const char * path, struct llama_model * model, bool streaming);
116107
struct llama_dataset * llama_dataset_load_parquet(const char * path, bool streaming);
117108

118-
// Legacy access functions
119-
uint64_t llama_dataset_get_sequence_count(const struct llama_dataset * dataset);
120-
int32_t llama_dataset_get_sequence_length(const struct llama_dataset * dataset, uint64_t index);
121-
const llama_token * llama_dataset_get_sequence(const struct llama_dataset * dataset, uint64_t index);
122-
123-
// Legacy conversion function
124-
bool llama_dataset_save_gguf(struct llama_dataset * dataset, const char * path);
125109
```
126110
127111
## 7. Dependencies
@@ -157,3 +141,38 @@ For developers working on this codebase:
157141
- See `docs/REMOVED_FILES.md` for information about the previous implementation.
158142
- The new interface is designed to be simpler and more consistent while maintaining backward compatibility.
159143
- Streaming optimization features significantly improve performance for large datasets.
144+
145+
Metadata:
146+
```
147+
training.format.version: int16 (e.g. 1000) - Specification version, in case of future changes.
148+
149+
training.format.source: string - Source format (gguf, text, parquet)
150+
151+
training.dataset.name: string (optional) - Dataset name (e.g. "OpenWebText-ru").
152+
153+
training.dataset.description: string (optional) - Dataset description (e.g. "Russian-language subset of OpenWebText, cleaned and deduplicated").
154+
155+
training.dataset.source: string (optional) - URL or description of the data source.
156+
157+
training.file.creation_date: string (ISO 8601) - File creation date.
158+
159+
training.tokenizer.gguf.model: string - Tokenizer model name (llama, gpt2, etc.).
160+
161+
training.tokenizer.gguf.vocab: array[string] - Tokenizer dictionary.
162+
163+
training.tokenizer.gguf.merges: array[string] - Tokenizer merges (for BPE).
164+
165+
training.tokenizer.gguf.pre: string (optional) - Pre-tokenization architecture.
166+
167+
Note: Instead of storing the entire tokenizer, you could reference the model file, but embedding ensures that the data file is completely self-contained.
168+
169+
training.sequence.count: uint64 - Total number of sequences in the file.
170+
```
171+
Tensors:
172+
```
173+
Naming: training.tensor.{index} (e.g. training.tensor.0, training.tensor.1, ...).
174+
175+
Data type: GGML_TYPE_I32 (standard for tokens in llama.cpp).
176+
177+
Shape: [sequence_length] - One-dimensional array. sequence_length will be different for each tensor.
178+
```

tools/dataset-converter/core/llama-dataset-internal.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
*/
1111

1212
#include "llama-dataset.h"
13-
#include "../../ggml/include/gguf.h"
14-
#include "../../ggml/include/ggml.h"
13+
#include "ggml/include/gguf.h"
14+
#include "ggml/include/ggml.h"
1515

1616
#ifdef __cplusplus
1717
#include "streaming-cache.h"
@@ -23,7 +23,7 @@ extern "C" {
2323
*
2424
* This structure contains all the necessary data for representing a dataset,
2525
* including the GGUF context, GGML context, and cached tensor pointers.
26-
*
26+
*
2727
* This structure is internal and should not be exposed to the public API.
2828
*/
2929
struct llama_dataset {
@@ -35,8 +35,9 @@ struct llama_dataset {
3535
bool streaming; // Streaming mode flag
3636
void * format_data; // Format-specific state
3737
#ifdef __cplusplus
38-
StreamingCache * streaming_cache; // LRU cache for streaming data
38+
llama_dataset_streaming_cache * streaming_cache; // LRU cache for streaming data
3939
void * optimization_manager; // Streaming optimization manager
40+
std::string column;
4041
#else
4142
void * streaming_cache; // Opaque pointer for C compatibility
4243
void * optimization_manager; // Opaque pointer for optimization manager
@@ -45,4 +46,4 @@ struct llama_dataset {
4546

4647
#ifdef __cplusplus
4748
}
48-
#endif
49+
#endif

0 commit comments

Comments
 (0)