6 changes: 0 additions & 6 deletions examples/training/CMakeLists.txt
@@ -3,9 +3,3 @@ add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET llama-finetune-lora)
add_executable(${TARGET} finetune-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
65 changes: 0 additions & 65 deletions examples/training/README.md
@@ -1,6 +1,5 @@
# llama.cpp/examples/training

## finetune
This directory contains examples related to language model training using llama.cpp/GGML.
So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
@@ -16,67 +15,3 @@ export model_name=llama_3.2-1b && export quantization=f32
```

The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.


## finetune-lora

LoRA (Low-Rank Adaptation) fine-tuning for efficient model training. This approach trains only a small set of additional parameters while keeping
the base model frozen, making it memory-efficient.
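
As background (this is the standard low-rank formulation from the original LoRA paper, not code specific to this example): each targeted weight matrix $W_0 \in \mathbb{R}^{d \times k}$ stays frozen, and the adapter trains two small matrices so that the effective weight becomes

$$ W = W_0 + \frac{\alpha}{r} B A, \qquad B \in \mathbb{R}^{d \times r},\; A \in \mathbb{R}^{r \times k},\; r \ll \min(d, k). $$

Only the $r(d + k)$ entries of $B$ and $A$ are stored and trained per matrix, which is why the adapter file is small and training needs far less memory than full finetuning.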

### Basic Usage

```sh
# Create new LoRA adapter with default settings (rank=8, alpha=16, attention modules)
./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512

# Custom LoRA parameters (creates a new LoRA adapter and trains it from scratch)
./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512 \
--lora-rank 16 --lora-alpha 32 --lora-modules "attn_q,attn_k,attn_v,attn_o"

# Fine-tune existing LoRA adapter
./build/bin/llama-finetune-lora -m base_model.gguf -f dataset.txt --lora existing_adapter.gguf \
--output-adapter improved_adapter.gguf -ngl 999 -c 512 -b 512 -ub 512
```


### Parameters

#### LoRA Configuration
- `--lora-rank N` - LoRA rank (default: 8)
- Lower rank = smaller adapter, less capacity
- Higher rank = larger adapter, more capacity
- `--lora-alpha N` - LoRA alpha scaling factor (default: 16.0)
- Controls adaptation strength
- Common rule: alpha = 2 × rank
- `--lora-modules MODULES` - Target modules as comma-separated list
- Available: `attn_q`, `attn_k`, `attn_v`, `attn_o`, `ffn_gate`, `ffn_up`, `ffn_down`, `embed`, `output`, `all`
- Default: `attn_q,attn_k,attn_v,attn_o` (attention modules)
- `--output-adapter PATH` - Output adapter filename (default: auto-generated)
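
To make the rank/size trade-off above concrete, here is a small standalone sketch (illustrative arithmetic only; the hidden size, layer count, and the assumption that every targeted projection is a square `d x d` matrix are hypothetical, and not exact for GQA models):

```cpp
// Rough LoRA adapter size estimate -- illustrative arithmetic only.
#include <cstdio>

int main() {
    const long long d         = 2048; // hypothetical hidden size
    const long long r         = 8;    // --lora-rank
    const long long n_layers  = 16;   // hypothetical layer count
    const long long n_modules = 4;    // attn_q, attn_k, attn_v, attn_o

    const long long per_matrix = r * (d + d);                 // B (d x r) + A (r x d)
    const long long total      = per_matrix * n_modules * n_layers;

    std::printf("adapter params: %lld (~%.1f MiB in F32)\n",
                total, total * 4.0 / (1024.0 * 1024.0));
    return 0;
}
```

With these example numbers the adapter holds about 2.1 M parameters (roughly 8 MiB in F32); doubling the rank roughly doubles both.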

#### Standard Parameters
- `-m MODEL` - Base model file (.gguf)
- `-f FILE` - Training dataset
- `-ngl N` - GPU layers (use 999 for full GPU training)
- `-c N` - Context length (512 recommended for mobile)


### Using Trained Adapters

After training, you'll get a small adapter file. Use it with the original base model:

```sh
./build/bin/llama-cli -m base_model.gguf --lora trained_adapter.gguf -ngl 999
```

### Troubleshooting

- **Out of memory**: Reduce context length (`-c 256`), lower rank, or use fewer target modules
- **Poor quality**: Increase rank, add more target modules, or train longer
- **Large adapter**: Reduce rank or limit target modules

### Help

Run with `--help` or `-h` to see all available parameters:
```sh
./build/bin/llama-finetune-lora --help
```
263 changes: 0 additions & 263 deletions examples/training/finetune-lora.cpp

This file was deleted.

7 changes: 3 additions & 4 deletions examples/training/finetune.cpp
@@ -54,8 +54,8 @@ int main(int argc, char ** argv) {
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}

std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);

struct lr_opt & lr = params.lr;
LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
@@ -64,8 +64,7 @@ int main(int argc, char ** argv) {

struct llama_opt_params lopt_params{
/*n_ctx_train =*/0,
// /*param_filter =*/llama_opt_param_filter_all,
llama_opt_param_filter_lora,
/*param_filter =*/llama_opt_param_filter_all,
/*param_filter_ud =*/nullptr,
/*get_opt_pars =*/common_opt_lr_pars,
/*get_opt_pars_ud =*/&params.lr,
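
A hedged aside on the `llama_n_ctx(ctx.get()) / 2` argument in the hunk above: it controls how the tokenized text is cut into training examples. The sketch below is illustrative only; it is not the actual `common_opt_dataset_init` implementation, and the window/stride semantics shown are an assumption:

```cpp
// Illustrative only: cut one token stream into fixed-length training windows
// whose start positions advance by `stride` tokens (e.g. half the context size).
#include <cstdint>
#include <vector>

using llama_token = int32_t;

std::vector<std::vector<llama_token>> make_windows(
        const std::vector<llama_token> & tokens,
        size_t window,    // tokens per training example
        size_t stride) {  // step between window starts, e.g. window / 2
    std::vector<std::vector<llama_token>> out;
    for (size_t i = 0; i + window <= tokens.size(); i += stride) {
        out.emplace_back(tokens.begin() + i, tokens.begin() + i + window);
    }
    return out;
}
```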
7 changes: 0 additions & 7 deletions ggml/include/ggml.h
@@ -479,7 +479,6 @@ extern "C" {
GGML_OP_REPEAT_BACK,
GGML_OP_CONCAT,
GGML_OP_SILU_BACK,
GGML_OP_GEGLU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
@@ -1131,12 +1130,6 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_geglu_back(
struct ggml_context * ctx,
struct ggml_tensor * grad,
struct ggml_tensor * x,
struct ggml_tensor * g);

// hardswish(x) = x * relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
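
For reference on the removed `ggml_geglu_back` declaration above, the underlying calculus is standard (the exact argument convention of the removed API is not documented here, so treat the naming as an assumption): if the GEGLU forward pass is $y = x \odot \mathrm{GELU}(g)$, then for an incoming gradient $\bar{y} = \partial L / \partial y$ (the `grad` argument),

$$ \frac{\partial L}{\partial x} = \bar{y} \odot \mathrm{GELU}(g), \qquad \frac{\partial L}{\partial g} = \bar{y} \odot x \odot \mathrm{GELU}'(g). $$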