Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 96 additions & 69 deletions sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <chrono>
#include <cstring>
#include <string>
#include <vector>

#include "rac/core/rac_logger.h"

Expand All @@ -15,39 +16,38 @@

namespace runanywhere {

// =============================================================================
// UTF-8 VALIDATION HELPER
// =============================================================================

static bool is_valid_utf8(const char* string) {
if (!string)
return true;

const unsigned char* bytes = (const unsigned char*)string;
int num;

while (*bytes != 0x00) {
if ((*bytes & 0x80) == 0x00) {
num = 1;
} else if ((*bytes & 0xE0) == 0xC0) {
num = 2;
} else if ((*bytes & 0xF0) == 0xE0) {
num = 3;
} else if ((*bytes & 0xF8) == 0xF0) {
num = 4;
} else {
return false;
}
// UTF-8 STATE MACHINE (DFA)

struct Utf8State {

uint32_t state = 0;

// Bjoern Hoehrmann LUT
bool process(uint8_t byte) {
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

bytes += 1;
for (int i = 1; i < num; ++i) {
if ((*bytes & 0xC0) != 0x80)
return false;
bytes += 1;
}
uint32_t type = utf8d[byte];
state = utf8d[256 + state * 16 + type];
return (state == 0);
Comment on lines +26 to +46
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the process() method doesn't update state correctly - the return value indicates if a character completed a valid sequence, but the state itself is internal to the DFA

Suggested change
bool process(uint8_t byte) {
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
bytes += 1;
for (int i = 1; i < num; ++i) {
if ((*bytes & 0xC0) != 0x80)
return false;
bytes += 1;
}
uint32_t type = utf8d[byte];
state = utf8d[256 + state * 16 + type];
return (state == 0);
bool process(uint8_t byte) {
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
uint32_t type = utf8d[byte];
uint32_t new_state = utf8d[256 + state * 16 + type];
state = new_state;
return (state == 0);
}
Prompt To Fix With AI
This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 26:46

Comment:
the `process()` method doesn't update `state` correctly - the return value indicates if a character completed a valid sequence, but the state itself is internal to the DFA

```suggestion
    bool process(uint8_t byte) {
        static const uint8_t utf8d[] = {
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
            7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
            8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
            0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
            0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
            0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
            1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
            1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
            1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
        };

        uint32_t type = utf8d[byte];
        uint32_t new_state = utf8d[256 + state * 16 + type];
        state = new_state;
        return (state == 0);
    }
```

How can I resolve this? If you propose a fix, please make it concise.

}
return true;
}

void reset() { state = 0; }
};

// =============================================================================
// LOG CALLBACK
Expand Down Expand Up @@ -457,12 +457,8 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
}

int effective_max_tokens = std::min(request.max_tokens, available_tokens);
if (effective_max_tokens < request.max_tokens) {
LOGI("Capping max_tokens: %d → %d (context=%d, prompt=%d tokens)", request.max_tokens,
effective_max_tokens, n_ctx, prompt_tokens);
}
LOGI("Generation: prompt_tokens=%d, max_tokens=%d, context=%d", prompt_tokens,
effective_max_tokens, n_ctx);
LOGI("Generation: prompt_tokens=%d, max_tokens=%d, context=%d",
prompt_tokens, effective_max_tokens, n_ctx);

llama_batch batch = llama_batch_init(n_ctx, 0, 1);

Expand All @@ -481,10 +477,27 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
llama_sampler_reset(sampler_);

const auto vocab = llama_model_get_vocab(model_);
std::string cached_token_chars;
std::string accumulated_text;

static const std::vector<std::string> STOP_SEQUENCES = {
"<|im_end|>", "<|eot_id|>", "</s>", "<|end|>", "<|endoftext|>",
"\n\nUser:", "\n\nHuman:",
};

static const size_t MAX_STOP_LEN = []{
size_t m = 0;
for (const auto& s : STOP_SEQUENCES) m = std::max(m, s.size());
return m;
}();

std::string stop_window;
stop_window.reserve(MAX_STOP_LEN * 2);

std::string partial_utf8_buffer;
partial_utf8_buffer.reserve(8);

int n_cur = batch.n_tokens;
int tokens_generated = 0;
bool stop_sequence_hit = false;

while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) {
const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1);
Expand All @@ -496,41 +509,55 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
break;
}

auto new_token_chars = common_token_to_piece(context_, new_token_id);
cached_token_chars += new_token_chars;
accumulated_text += new_token_chars;

static const std::vector<std::string> stop_sequences = {
"<|im_end|>",
"<|eot_id|>",
"</s>",
"<|end|>",
"<|endoftext|>",
"\n\nUser:",
"\n\nHuman:",
};
const std::string new_token_chars =
common_token_to_piece(context_, new_token_id);

bool hit_stop_sequence = false;
for (const auto& stop_seq : stop_sequences) {
size_t pos = accumulated_text.find(stop_seq);
if (pos != std::string::npos) {
LOGI("Stop sequence detected: %s", stop_seq.c_str());
hit_stop_sequence = true;
break;
partial_utf8_buffer.append(new_token_chars);

Utf8State scanner_state;
size_t valid_upto = 0;
for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
if (scanner_state.state == 0) {
valid_upto = i + 1;
}
}
Comment on lines +517 to 524
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scanner_state should be reused or reset across iterations - creating a new Utf8State inside the loop means each token's UTF-8 validation starts fresh, but partial_utf8_buffer can contain incomplete sequences from previous iterations

Suggested change
Utf8State scanner_state;
size_t valid_upto = 0;
for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
if (scanner_state.state == 0) {
valid_upto = i + 1;
}
}
Utf8State scanner_state;
size_t valid_upto = 0;
for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
bool complete = scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
if (complete) {
valid_upto = i + 1;
}
}
Prompt To Fix With AI
This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 517:524

Comment:
scanner_state should be reused or reset across iterations - creating a new `Utf8State` inside the loop means each token's UTF-8 validation starts fresh, but `partial_utf8_buffer` can contain incomplete sequences from previous iterations

```suggestion
        Utf8State scanner_state;
        size_t valid_upto = 0;
        for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
            bool complete = scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
            if (complete) {
                valid_upto = i + 1;
            }
        }
```

How can I resolve this? If you propose a fix, please make it concise.


if (hit_stop_sequence) {
break;
}
if (valid_upto > 0) {
std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
stop_window.append(valid_chunk);
partial_utf8_buffer.erase(0, valid_upto);

size_t found_stop_pos = std::string::npos;
for (const auto& stop_seq : STOP_SEQUENCES) {
size_t pos = stop_window.find(stop_seq);
if (pos != std::string::npos) {
if (found_stop_pos == std::string::npos || pos < found_stop_pos) {
found_stop_pos = pos;
}
}
}

if (is_valid_utf8(cached_token_chars.c_str())) {
if (!callback(cached_token_chars)) {
LOGI("Generation cancelled by callback");
cancel_requested_.store(true);
if (found_stop_pos != std::string::npos) {
LOGI("Stop sequence detected");
stop_sequence_hit = true;
if (found_stop_pos > 0) {
if (!callback(stop_window.substr(0, found_stop_pos))) {
cancel_requested_.store(true);
}
}
break;
}
cached_token_chars.clear();

if (stop_window.size() > MAX_STOP_LEN) {
size_t safe_len = stop_window.size() - MAX_STOP_LEN;
if (!callback(stop_window.substr(0, safe_len))) {
LOGI("Generation cancelled by callback");
cancel_requested_.store(true);
break;
}
stop_window.erase(0, safe_len);
}
}

batch.n_tokens = 0;
Expand All @@ -545,8 +572,8 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
}
}

if (!cached_token_chars.empty() && is_valid_utf8(cached_token_chars.c_str())) {
callback(cached_token_chars);
if (!cancel_requested_.load() && !stop_sequence_hit && !stop_window.empty()) {
callback(stop_window);
}

llama_memory_clear(llama_get_memory(context_), true);
Expand Down
Loading