Skip to content
Closed

Dev #440

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
405 changes: 405 additions & 0 deletions sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,47 @@ class LlamaCppTextGeneration {
bool generate_stream(const TextGenerationRequest& request, TextStreamCallback callback,
int* out_prompt_tokens);
void cancel();

/**
* @brief Check whether the given context answers the query, using logit probing.
*
* Formats a Yes/No question, runs llama_decode for prefill only (no generation),
* extracts logits for the "Yes" and "No" tokens at the last position, and computes
* confidence via softmax. Probe tokens are removed from the KV cache before returning.
*
* @param context The context passage (retrieved sentence or accumulated sentences)
* @param query The user query to check against the context
* @return Confidence score in [0.0, 1.0] — higher means context likely answers query.
* Returns 0.5 on error (neutral / unknown).
*/
float probe_confidence(const std::string& context, const std::string& query);

/**
* @brief Inject a system prompt into the KV cache at position 0.
* Clears existing KV cache first, then decodes the prompt tokens.
* @return true on success, false on error.
*/
bool inject_system_prompt(const std::string& prompt);

/**
* @brief Append text to the KV cache after current content.
* Does not clear existing KV cache — adds at current position.
* @return true on success, false on error.
*/
bool append_context(const std::string& text);

/**
* @brief Generate a response from accumulated KV cache state.
* Unlike generate(), does NOT clear the KV cache first.
* @return TextGenerationResult with generated text.
*/
TextGenerationResult generate_from_context(const TextGenerationRequest& request);

/**
* @brief Clear all KV cache state.
*/
void clear_context();

/**
 * @brief Get metadata about the loaded model as a JSON object.
 * @return nlohmann::json with model information (contents are backend-defined —
 *         presumably name/params/context size; confirm against the .cpp implementation).
 */
nlohmann::json get_model_info() const;

// LoRA adapter management
Expand Down
39 changes: 38 additions & 1 deletion sdk/runanywhere-commons/src/backends/rag/inference_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,47 @@ class ITextGenerator {

/**
 * @brief Get maximum context size in tokens.
 *
 * @return Context window size
 */
virtual int context_size() const noexcept = 0;

/**
 * @brief Seed the KV cache with a system prompt at position 0.
 *
 * Invoked a single time when an adaptive query loop begins. The base
 * implementation does nothing and reports failure so callers can detect
 * that the backend lacks KV-cache support.
 *
 * @param prompt System prompt text to inject.
 * @return false in this default implementation (no-op).
 */
virtual bool inject_system_prompt(const std::string& prompt) {
    static_cast<void>(prompt);  // unused in the default no-op
    return false;
}

/**
 * @brief Extend the KV cache with additional text at the current position.
 *
 * Used by the adaptive loop to feed sentences in incrementally. The base
 * implementation does nothing and reports failure so callers can detect
 * that the backend lacks KV-cache support.
 *
 * @param text Text to append after the current cache contents.
 * @return false in this default implementation (no-op).
 */
virtual bool append_context(const std::string& text) {
    static_cast<void>(text);  // unused in the default no-op
    return false;
}

/**
 * @brief Estimate how confident the backend is that the accumulated
 *        context answers the query.
 *
 * The base implementation cannot probe, so it returns a neutral 0.5,
 * which keeps the adaptive loop running.
 *
 * @param context Accumulated context passage.
 * @param query   User query being checked against the context.
 * @return 0.5f (neutral) in this default implementation.
 */
virtual float probe_confidence(const std::string& context, const std::string& query) {
    static_cast<void>(context);  // unused in the default no-op
    static_cast<void>(query);    // unused in the default no-op
    return 0.5f;
}

/**
 * @brief Produce a response using whatever state has accumulated in the
 *        KV cache; unlike generate(), the cache is NOT cleared first.
 *
 * The base implementation has no KV-cache state, so it simply delegates
 * to the stateless generate() path with the same arguments.
 *
 * @param query   Query text to generate a response for.
 * @param options Generation options (defaults to GenerationOptions{}).
 * @return Result of generate(query, options).
 */
virtual GenerationResult generate_from_context(const std::string& query, const GenerationOptions& options = GenerationOptions{}) {
    // Fallback: no cached state to build on, so run the normal path.
    return generate(query, options);
}

/**
 * @brief Discard all KV cache state.
 *
 * The base implementation holds no cache, so there is nothing to clear.
 */
virtual void clear_context() {
    // Intentionally empty: default backends keep no KV-cache state.
}
};

// =============================================================================
Expand Down
Loading
Loading