diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index abc4fa1c8931f..e05481019603e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,7 +17,6 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(gguf-hash)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
@@ -35,6 +34,7 @@ else()
     add_subdirectory(training)
     add_subdirectory(diffusion)
     add_subdirectory(model-conversion)
+    add_subdirectory(prediction-next-token)
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading
diff --git a/examples/prediction-next-token/CMakeLists.txt b/examples/prediction-next-token/CMakeLists.txt
new file mode 100644
index 0000000000000..c8c828de205de
--- /dev/null
+++ b/examples/prediction-next-token/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET prediction-next-token)
+add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/prediction-next-token/README.md b/examples/prediction-next-token/README.md
new file mode 100644
index 0000000000000..c52c2cabdfecc
--- /dev/null
+++ b/examples/prediction-next-token/README.md
@@ -0,0 +1,48 @@
+# llama.cpp/examples/prediction-next-token
+
+This directory contains an example demonstrating **next-token prediction** using LLaMA models through [llama.cpp/GGML](https://github.com/ggml-org/llama.cpp).
+
+The tool can be useful for checking and measuring fine-tuning results on individual examples
+(currently CPU only).
+
+---
+
+## Usage
+
+```
+prediction-next-token --model <model.gguf> --prompt <prompt> [--hypothesis <word>]
+```
+
+or the short form:
+
+```
+prediction-next-token -m <model.gguf> -p <prompt> [-h <word>]
+```
+
+**Example:**
+
+```bash
+prediction-next-token -m "models\llama-3.2-1B-q4_k_m-128k.gguf" -p "Who invented E=mc^2?" -h "Einstein"
+```
+
+---
+
+### Notes for non-English UTF-8 text (e.g., Russian)
+
+On **Windows**, it is recommended to use **Windows Terminal**:
+
+```
+chcp 65001
+.\prediction-next-token.exe -m "models\llama-3.2-1B-q4_k_m-128k-ru.gguf" -p "Здравствуйте!" -h "Привет"
+```
+
+* This ensures correct handling of UTF-8 characters both for input arguments and for output in the console.
+
+---
+
+## Notes on Model Behavior
+
+* The `--hypothesis` argument is optional and specifies the expected first word to evaluate.
+* After fine-tuning on a dataset, the **perplexity** of the model on a test set should decrease over training epochs (see the note below).
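+
+For reference (standard definition, not specific to this example): if the model assigns probability `p_i` to the correct token at each of `N` test positions, then `perplexity = exp(-(1/N) * Σ log p_i)`, so higher probabilities for the expected tokens translate directly into lower perplexity.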
diff --git a/examples/prediction-next-token/main.cpp b/examples/prediction-next-token/main.cpp
new file mode 100644
index 0000000000000..524a3f6912957
--- /dev/null
+++ b/examples/prediction-next-token/main.cpp
@@ -0,0 +1,236 @@
+#include <llama.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cwchar>
+#include <string>
+#include <vector>
+
+struct TokenInfo {
+    int         id;
+    float       p;
+    std::string piece;
+};
+
+#include <windows.h>
+
+#include <cstdlib> // for malloc/free
+#include <cstring> // for strlen
+
+const char * Utf8FromUtf16(const wchar_t * wstr) {
+    if (!wstr) {
+        return nullptr;
+    }
+
+    int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr);
+
+    char * buffer = (char *) malloc(size_needed);
+    if (!buffer) {
+        return nullptr;
+    }
+
+    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, buffer, size_needed, nullptr, nullptr);
+
+    return buffer; // the caller must call free()
+}
+
+int wmain(int argc, wchar_t * argv[]) {
+    SetConsoleOutputCP(CP_UTF8);
+    SetConsoleCP(CP_UTF8);
+    // default values
+    const char * model_path = nullptr;
+    const char * prompt = nullptr;
+    const char * word = nullptr;
+
+    // parse arguments
+    for (int i = 1; i < argc; i++) {
+        if ((wcscmp(argv[i], L"-m") == 0 || wcscmp(argv[i], L"--model") == 0) && i + 1 < argc) {
+            model_path = Utf8FromUtf16(argv[++i]);
+        } else if ((wcscmp(argv[i], L"-p") == 0 || wcscmp(argv[i], L"--prompt") == 0) && i + 1 < argc) {
+            prompt = Utf8FromUtf16(argv[++i]);
+        } else if ((wcscmp(argv[i], L"-h") == 0 || wcscmp(argv[i], L"--hypothesis") == 0) && i + 1 < argc) {
+            word = Utf8FromUtf16(argv[++i]);
+        } else if (i == 1 && argv[i][0] != L'-') {
+            model_path = Utf8FromUtf16(argv[i]);
+            if (i + 1 < argc) {
+                prompt = Utf8FromUtf16(argv[++i]);
+            }
+        }
+    }
+
+    // check required arguments
+    if (model_path == nullptr || prompt == nullptr) {
+        fprintf(stderr,
+                "Usage: %s -m|--model <model.gguf> -p|--prompt <prompt> [-h|--hypothesis <word>]\n",
+                Utf8FromUtf16(argv[0]));
+        return 1;
+    }
+
+    // 0) backend
+    llama_backend_init();
+
+    // 1) load model
+    llama_model_params model_params = llama_model_default_params();
+    llama_model * model = llama_model_load_from_file(model_path, model_params);
+    if (!model) {
+        fprintf(stderr, "failed to load model: %s\n", model_path);
+        llama_backend_free();
+        return 1;
+    }
+
+    // 2) context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 512;
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr, "failed to create context\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+
+    // 3) vocab
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // 4) tokenize the full prompt
+    int max_tokens = 256;
+    std::vector<llama_token> tok(max_tokens);
+
+    int n_tok = llama_tokenize(vocab,
+                               prompt,
+                               (int) strlen(prompt),
+                               tok.data(),
+                               (int) tok.size(),
+                               /*add_special=*/true,
+                               /*parse_special=*/true);
+    if (n_tok < 0) {
+        max_tokens = -n_tok;
+        tok.resize(max_tokens);
+        n_tok = llama_tokenize(vocab, prompt, (int) strlen(prompt), tok.data(), (int) tok.size(), true, true);
+    }
+    if (n_tok <= 0) {
+        fprintf(stderr, "tokenization failed\n");
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+    tok.resize(n_tok);
+
+    // 5) build the batch correctly (do NOT allocate seq_id manually!)
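+    // For reference, a minimal sketch of the equivalent explicit setup (assuming the
+    // llama_batch_init / llama_batch_free API from llama.h); llama_batch_get_one below
+    // is simply the shorter path:
+    //
+    //   llama_batch b = llama_batch_init((int) tok.size(), /*embd=*/0, /*n_seq_max=*/1);
+    //   b.n_tokens = (int) tok.size();
+    //   for (int i = 0; i < b.n_tokens; ++i) {
+    //       b.token[i]     = tok[i];
+    //       b.pos[i]       = i;
+    //       b.n_seq_id[i]  = 1;
+    //       b.seq_id[i][0] = 0;
+    //       b.logits[i]    = (i == b.n_tokens - 1); // request logits only for the last token
+    //   }
+    //   // ... llama_decode(ctx, b); ...
+    //   llama_batch_free(b);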
+    llama_batch batch = llama_batch_get_one(tok.data(), (int) tok.size());
+    // batch.pos / batch.seq_id / batch.n_seq_id / batch.logits are left as nullptr:
+    // the runtime fills in the correct values itself and returns logits for the last token
+
+    // 6) decode
+    int ret = llama_decode(ctx, batch);
+    if (ret != 0) {
+        fprintf(stderr, "llama_decode failed, ret = %d\n", ret);
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+
+    // 7) logits of the last token in the batch
+    // (this is the safe way: these are the "last" logits, corresponding to the token marked as last)
+    const float * logits = llama_get_logits(ctx);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    // 8) softmax + top-10
+    // find the maximum for a numerically stable softmax
+    float max_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) {
+        if (logits[i] > max_logit) {
+            max_logit = logits[i];
+        }
+    }
+    // compute exp and the sum
+    std::vector<float> probs(n_vocab);
+    double sum = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        float e = std::exp(logits[i] - max_logit);
+        probs[i] = e;
+        sum += e;
+    }
+    for (int i = 0; i < n_vocab; ++i) {
+        probs[i] = (float) (probs[i] / sum);
+    }
+
+    // collect indices and sort them by probability
+    std::vector<int> ids(n_vocab);
+    for (int i = 0; i < n_vocab; ++i) {
+        ids[i] = i;
+    }
+    std::partial_sort(ids.begin(), ids.begin() + 10, ids.end(), [&](int a, int b) { return probs[a] > probs[b]; });
+
+    // 9) print the top-10
+    char piece[256];
+    for (int r = 0; r < 10; ++r) {
+        int id = ids[r];
+        int n = llama_token_to_piece(vocab,
+                                     id,
+                                     piece,
+                                     sizeof(piece),
+                                     /*lstrip=*/true,
+                                     /*special=*/true);
+        if (n < 0) {
+            snprintf(piece, sizeof(piece), "<invalid token %d>", id);
+        } else {
+            piece[n] = '\0';
+        }
+        printf("%2d) id=%6d p=%.6f \"%s\"\n", r + 1, id, probs[id], piece);
+    }
+
+    if (word != nullptr) {
+        // 10) also print the tokens of interest
+        std::vector<TokenInfo> tokens_info;
+
+        // collect all prefixes of the hypothesis string
+        std::vector<std::string> prefixes;
+        size_t text_len = strlen(word);
+        if (text_len >= 256) {
+            text_len = 255; // keep prefixes within the local buffer size
+        }
+        for (size_t len = 1; len <= text_len; len++) {
+            char buf[256];
+            memcpy(buf, word, len);
+            buf[len] = '\0';
+            prefixes.push_back(buf);
+        }
+
+        // scan the vocabulary for all tokens that match one of the prefixes
+        for (int id = 0; id < llama_vocab_n_tokens(vocab); ++id) {
+            char piece[256];
+            int n = llama_token_to_piece(vocab, id, piece, sizeof(piece), true, true);
+            if (n <= 0) {
+                continue;
+            }
+            piece[n] = '\0';
+
+            // check for a match with a prefix
+            for (const auto & pref : prefixes) {
+                if (strcmp(piece, pref.c_str()) == 0) {
+                    tokens_info.push_back({ id, probs[id], piece });
+                }
+            }
+        }
+
+        // sort by descending probability
+        std::sort(
+            tokens_info.begin(), tokens_info.end(), [](const TokenInfo & a, const TokenInfo & b) { return a.p > b.p; });
+
+        // output
+        for (const auto & t : tokens_info) {
+            if (t.p > 0.00000049f) {
+                printf("id=%6d p=%.6f \"%s\"\n", t.id, t.p, t.piece.c_str());
+            }
+        }
+    }
+
+    // 11) cleanup
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+    return 0;
+}