@@ -3502,13 +3502,10 @@ static bool llama_kv_cache_init(
35023502    return true;
35033503}
35043504
3505- // find an empty slot of size "n_tokens" in the cache
3506- // updates the cache head
3507- // Note: On success, it's important that cache.head points
3508- // to the first cell of the slot.
3505+ // a structure that holds information about a slot found by llama_kv_cache_find_slot
35093506struct llama_kv_cache_slot_info {
3510-     std::pair<uint32_t, uint32_t> boundaries;
3511-     bool found = false;
3507+     std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
3508+     bool found = false;                       // the slot was found
35123509
35133510    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
35143511    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
@@ -3517,6 +3514,11 @@ struct llama_kv_cache_slot_info {
35173514};
35183515static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
35193516
3517+ // find an empty slot of size "n_tokens" in the cache
3518+ // updates the cache head
3519+ // returns a structure holding information about the slot found
3520+ // Note: On success, it's important that cache.head points
3521+ // to the first cell of the slot.
35203522static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
35213523           struct llama_kv_cache & cache,
35223524       const struct llama_ubatch & batch) {
@@ -4019,7 +4021,9 @@ struct llama_kv_slot_restorer {
40194021        uint32_t n    = 0;
40204022    } old_state;
40214023
4022-     std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries; // for non-recurrent models only
4024+     // for non-recurrent models only
4025+     // list of slots to restore
4026+     std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
40234027
40244028    bool do_restore = false;
40254029
@@ -4028,7 +4032,8 @@ struct llama_kv_slot_restorer {
40284032        old_state.n     = cache.n;
40294033    }
40304034
4031-     void save(const struct llama_kv_cache_slot_info& slot) {
4035+     // saves slot information for future restoration
4036+     void save(const struct llama_kv_cache_slot_info & slot) {
40324037        if (slot) {
40334038            do_restore = true;
40344039            if (slot.boundaries.first != slot.boundaries.second) {
@@ -4037,6 +4042,8 @@ struct llama_kv_slot_restorer {
40374042        }
40384043    }
40394044
4045+     // must be explicitly called to restore the kv_cache state
4046+     // and rollback changes from all llama_kv_cache_find_slot calls
40404047    void restore(struct llama_kv_cache & cache) {
40414048        if (do_restore) {
40424049            cache.head  = old_state.head;
@@ -17236,6 +17243,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
1723617243    }
1723717244}
1723817245
17246+ // returns the result of ggml_backend_sched_graph_compute_async execution
1723917247static enum ggml_status llama_graph_compute(
1724017248          llama_context & lctx,
1724117249            ggml_cgraph * gf,
@@ -17262,6 +17270,9 @@ static enum ggml_status llama_graph_compute(
1726217270}
1726317271
1726417272// decode a batch of tokens by evaluating the transformer
17273+ // in case of unsuccessful decoding (error or warning),
17274+ // the kv_cache state will be returned to its original state
17275+ // (for non-recurrent models) or cleaned (for recurrent models)
1726517276//
1726617277//   - lctx:      llama context
1726717278//   - batch:     batch to evaluate
0 commit comments