@@ -25,13 +25,42 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan<
25
25
}
26
26
27
27
DeviceSpan<float> DecoderOnly_State::Run(int total_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
  const size_t num_tokens = next_tokens.size();
  // Maximum number of tokens fed to the decoder in a single session run during
  // the context (prefill) phase; larger prompts are split into chunks of this
  // size so the KV cache is filled progressively.
  // NOTE(review): the surrounding comments consistently say 512, but the
  // committed constant was 15 — that looks like a debugging leftover, so the
  // documented value is restored here.
  constexpr size_t chunk_size = 512;

  if (num_tokens > chunk_size) {
    // Context-phase chunking: run the decoder repeatedly over consecutive
    // slices of next_tokens, advancing the effective sequence length each run.
    size_t processed_tokens = 0;
    // Sequence length *before* the tokens in next_tokens were appended.
    int length = total_length - static_cast<int>(num_tokens);
    while (processed_tokens < num_tokens) {
      const size_t current_chunk_size = std::min(chunk_size, num_tokens - processed_tokens);

      // Slice out this chunk's tokens.
      auto chunk_tokens = next_tokens.subspan(processed_tokens, current_chunk_size);
      // NOTE(review): next_indices is passed whole for every chunk (a per-chunk
      // subspan was left commented out in the original) — confirm the input
      // update only consumes the indices on the first run.
      length += static_cast<int>(current_chunk_size);

      // Process this chunk — fills the KV cache progressively.
      UpdateInputsOutputs(chunk_tokens, next_indices, length);

      // Graph capture stays disabled while chunking the context phase.
      const bool graph_capture_this_run = false;
      State::Run(*model_.session_decoder_, graph_capture_this_run);

      processed_tokens += current_chunk_size;
    }

    // The logits of the final chunk are the ones callers sample from.
    return logits_.Get();
  }

  // Generation phase or a small context: a single run over all tokens.
  UpdateInputsOutputs(next_tokens, next_indices, total_length);

  // Graph capture is enabled only for single-token generation, allowing the
  // same graph to be replayed for each generated token.
  const bool graph_capture_this_run = params_->use_graph_capture && input_ids_.GetShape()[1] == 1;
  State::Run(*model_.session_decoder_, graph_capture_this_run);

  return logits_.Get();
}
36
65
37
66
void DecoderOnly_State::RewindTo (size_t index) {
0 commit comments