Skip to content

Commit 872584b

Browse files
author
lexasub
committed
tool: refactor dataset converter
1 parent a26e37b commit 872584b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2085
-2408
lines changed

common/arg.cpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,7 +1482,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14821482
[](common_params & params) {
14831483
params.ctx_shift = false;
14841484
}
1485-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
1485+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_DATASET}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
14861486
add_opt(common_arg(
14871487
{"--chunks"}, "N",
14881488
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1551,7 +1551,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15511551
}
15521552
params.in_files.push_back(value);
15531553
}
1554-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
1554+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_DATASET}));
15551555
add_opt(common_arg(
15561556
{"-bf", "--binary-file"}, "FNAME",
15571557
"binary file containing the prompt (default: none)",
@@ -2541,7 +2541,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25412541
[](common_params & params, const std::string & value) {
25422542
params.model.path = value;
25432543
}
2544-
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
2544+
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DATASET}).set_env("LLAMA_ARG_MODEL"));
25452545
add_opt(common_arg(
25462546
{"-mu", "--model-url"}, "MODEL_URL",
25472547
"model download url (default: unused)",
@@ -2639,7 +2639,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26392639
[](common_params & params, const std::string & value) {
26402640
params.out_file = value;
26412641
}
2642-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
2642+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_DATASET}));
26432643
add_opt(common_arg(
26442644
{"-ofreq", "--output-frequency"}, "N",
26452645
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3517,39 +3517,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35173517
[](common_params & params, const std::string & format) {
35183518
params.dataset_format = format; //TODO ENUM CLASS
35193519
}
3520-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3520+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35213521

35223522
add_opt(common_arg(
35233523
{"--max-seq-len"}, " ",
35243524
string_format("max sequence length (default: %d)", params.max_seq_len),
35253525
[](common_params & params, int32_t max_seq_len) {
35263526
params.max_seq_len = max_seq_len;
35273527
}
3528-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3528+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35293529

35303530
add_opt(common_arg(
35313531
{"--pre-tokenized"},
35323532
string_format("input file contains pre-tokenized data (space-separated token IDs)"),
35333533
[](common_params & params) {
35343534
params.pre_tokenized = true;
35353535
}
3536-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3536+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35373537

35383538
add_opt(common_arg(
35393539
{"--preview"},
35403540
string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
35413541
[](common_params & params) {
35423542
params.do_preview = true;
35433543
}
3544-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3544+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35453545

35463546
add_opt(common_arg(
35473547
{"--dataset-column"}, "<name>",
35483548
string_format("column name for data in dataset files"),
35493549
[](common_params & params, const std::string &dataset_column) {
35503550
params.dataset_column = dataset_column;
35513551
}
3552-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3552+
).set_examples({LLAMA_EXAMPLE_DATASET}));
3553+
3554+
add_opt(common_arg(
3555+
{"--streaming"},
3556+
string_format("enable streaming dataset"),
3557+
[](common_params & params) {
3558+
params.dataset_streaming = true;
3559+
}
3560+
).set_examples({LLAMA_EXAMPLE_DATASET}));
35533561

35543562
return ctx_arg;
35553563
}

common/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ enum llama_example {
8484
LLAMA_EXAMPLE_TTS,
8585
LLAMA_EXAMPLE_DIFFUSION,
8686
LLAMA_EXAMPLE_FINETUNE,
87+
LLAMA_EXAMPLE_DATASET,
8788

8889
LLAMA_EXAMPLE_COUNT,
8990
};
@@ -471,6 +472,7 @@ struct common_params {
471472
bool do_preview = false;
472473
bool pre_tokenized = false;
473474
std::string dataset_column = "data";
475+
bool dataset_streaming = false;
474476
};
475477

476478
// call once at the start of a program if it uses libcommon

tools/dataset-converter/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ target_link_libraries(streaming_analyzer PRIVATE dataset_convert_lib)
7070
add_executable(test_data_validator "${CMAKE_CURRENT_SOURCE_DIR}/tools/test-data-validation-tool.cpp")
7171
target_link_libraries(test_data_validator PRIVATE dataset_convert_lib)
7272

73-
# Installation rule for the executable
7473
install(TARGETS dataset_converter streaming_analyzer test_data_validator
7574
DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
7675
)

tools/dataset-converter/README.md

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,29 +28,20 @@ The Dataset Converter addresses this challenge by:
2828
The tool is run from the command line. The basic syntax is as follows:
2929

3030
```bash
31-
./dataset_converter [options] <input_file> <output_file>
31+
./dataset_converter [options] --in-file <input_file> -o <output_file> --streaming
3232
```
33-
34-
**Options:**
35-
36-
| Flag | Description |
37-
| ------------------- | ------------------------------------------------------------ |
38-
| `-h`, `--help` | Show the help message and exit. |
39-
| `--model MODEL` | Path to the `llama.cpp` model for tokenization (required for text input). |
40-
| `--streaming` | Use streaming mode for large datasets. |
41-
4233
### Examples
4334

4435
**Convert a text file to GGUF:**
4536

4637
```bash
47-
./dataset_converter --model ./models/7B/ggml-model-f16.gguf --streaming input.txt output.gguf
38+
./dataset_converter --model ./models/7B/ggml-model-f16.gguf --streaming --in-file input.txt -o output.gguf
4839
```
4940

5041
**Convert a Parquet file to GGUF:**
5142

5243
```bash
53-
./dataset_converter --streaming input.parquet output.gguf
44+
./dataset_converter --streaming --in-file input.parquet -o output.gguf --dataset-column tokens
5445
```
5546

5647
## 5. Directory Structure
@@ -115,13 +106,6 @@ struct llama_dataset * llama_dataset_load_gguf(const char * path, bool streaming
115106
struct llama_dataset * llama_dataset_load_text(const char * path, struct llama_model * model, bool streaming);
116107
struct llama_dataset * llama_dataset_load_parquet(const char * path, bool streaming);
117108

118-
// Legacy access functions
119-
uint64_t llama_dataset_get_sequence_count(const struct llama_dataset * dataset);
120-
int32_t llama_dataset_get_sequence_length(const struct llama_dataset * dataset, uint64_t index);
121-
const llama_token * llama_dataset_get_sequence(const struct llama_dataset * dataset, uint64_t index);
122-
123-
// Legacy conversion function
124-
bool llama_dataset_save_gguf(struct llama_dataset * dataset, const char * path);
125109
```
126110
127111
## 7. Dependencies
@@ -157,3 +141,38 @@ For developers working on this codebase:
157141
- See `docs/REMOVED_FILES.md` for information about the previous implementation.
158142
- The new interface is designed to be simpler and more consistent while maintaining backward compatibility.
159143
- Streaming optimization features significantly improve performance for large datasets.
144+
145+
Metadata:
146+
```
147+
training.format.version: int16 (e.g. 1000) - Specification version, in case of future changes.
148+
149+
training.format.source: string - Source format (gguf, text, parquet)
150+
151+
training.dataset.name: string (optional) - Dataset name (e.g. "OpenWebText-ru").
152+
153+
training.dataset.description: string (optional) - Dataset description (e.g. "Russian-language subset of OpenWebText, cleaned and deduplicated").
154+
155+
training.dataset.source: string (optional) - URL or description of the data source.
156+
157+
training.file.creation_date: string (ISO 8601) - File creation date.
158+
159+
training.tokenizer.gguf.model: string - Tokenizer model name (llama, gpt2, etc.).
160+
161+
training.tokenizer.gguf.vocab: array[string] - Tokenizer dictionary.
162+
163+
training.tokenizer.gguf.merges: array[string] - Tokenizer merges (for BPE).
164+
165+
training.tokenizer.gguf.pre: string (optional) - Pre-tokenization architecture.
166+
167+
Note: Instead of storing the entire tokenizer, you could reference the model file, but embedding ensures that the data file is completely self-contained.
168+
169+
training.sequence.count: uint64 - Total number of sequences in the file.
170+
```
171+
Tensors:
172+
```
173+
Naming: training.tensor.{index} (e.g. training.tensor.0, training.tensor.1, ...).
174+
175+
Data type: GGML_TYPE_I32 (standard for tokens in llama.cpp).
176+
177+
Shape: [sequence_length] - One-dimensional array. sequence_length will be different for each tensor.
178+
```

tools/dataset-converter/core/llama-dataset-internal.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
*/
1111

1212
#include "llama-dataset.h"
13-
#include "../../ggml/include/gguf.h"
14-
#include "../../ggml/include/ggml.h"
13+
#include "ggml/include/gguf.h"
14+
#include "ggml/include/ggml.h"
1515

1616
#ifdef __cplusplus
1717
#include "streaming-cache.h"
@@ -23,7 +23,7 @@ extern "C" {
2323
*
2424
* This structure contains all the necessary data for representing a dataset,
2525
* including the GGUF context, GGML context, and cached tensor pointers.
26-
*
26+
*
2727
* This structure is internal and should not be exposed to the public API.
2828
*/
2929
struct llama_dataset {
@@ -35,8 +35,9 @@ struct llama_dataset {
3535
bool streaming; // Streaming mode flag
3636
void * format_data; // Format-specific state
3737
#ifdef __cplusplus
38-
StreamingCache * streaming_cache; // LRU cache for streaming data
38+
llama_dataset_streaming_cache * streaming_cache; // LRU cache for streaming data
3939
void * optimization_manager; // Streaming optimization manager
40+
std::string column;
4041
#else
4142
void * streaming_cache; // Opaque pointer for C compatibility
4243
void * optimization_manager; // Opaque pointer for optimization manager
@@ -45,4 +46,4 @@ struct llama_dataset {
4546

4647
#ifdef __cplusplus
4748
}
48-
#endif
49+
#endif

0 commit comments

Comments
 (0)