Commits (33)
d2bb0ae
Extend attn prefill
zeeshanhaque21 Oct 15, 2025
f8de260
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 15, 2025
ad46fef
Fix chunked mode
zeeshanhaque21 Oct 15, 2025
25af208
Add tests
zeeshanhaque21 Oct 15, 2025
638361e
Fix tests
zeeshanhaque21 Oct 15, 2025
03be4cb
precommit fix
zeeshanhaque21 Oct 15, 2025
31e3aed
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 15, 2025
3c082fe
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 15, 2025
730591c
Change chunking strategy to dynamically recompute based on number of c…
zeeshanhaque21 Oct 16, 2025
74ee726
Fix tests
zeeshanhaque21 Oct 16, 2025
3496380
precommit
zeeshanhaque21 Oct 16, 2025
66bce01
cleanup
zeeshanhaque21 Oct 16, 2025
0a0896e
Address PR comments
zeeshanhaque21 Oct 17, 2025
c61915f
Refactor scheduler and prefill task
zeeshanhaque21 Oct 17, 2025
10794f0
Add tests for PrefillTask
zeeshanhaque21 Oct 17, 2025
6c15862
Formatting
zeeshanhaque21 Oct 17, 2025
3ad3509
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 17, 2025
759e204
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 27, 2025
9eb0de4
Add parameter
zeeshanhaque21 Oct 27, 2025
249fdf9
Merge branch 'main' into extend-attn-shortfin
zeeshanhaque21 Oct 27, 2025
5446cf1
Modify sharktank to export flags
zeeshanhaque21 Oct 27, 2025
3fdfded
Change min prefill bs to 1 in export
zeeshanhaque21 Oct 27, 2025
b16a5e3
Add debug logs to investigate data corruption
zeeshanhaque21 Oct 27, 2025
709f975
revert back to bs_min of 2 for torch.export
zeeshanhaque21 Oct 27, 2025
eb621cb
Add debug logs
zeeshanhaque21 Oct 27, 2025
856a70a
add use_extend_attention to ServiceConfig & update prefill name
archana-ramalingam Oct 27, 2025
353d8ac
Enable extend attention in default path
archana-ramalingam Oct 28, 2025
13858a5
Merge branch 'main' into update-extend-attn
archana-ramalingam Oct 28, 2025
82bb572
Fix error
archana-ramalingam Oct 28, 2025
7957f6e
Merge branch 'update-extend-attn' of https://github.com/nod-ai/shark-…
archana-ramalingam Oct 28, 2025
c9fdadb
Add debug statements
zeeshanhaque21 Oct 29, 2025
5e245a7
Merge remote-tracking branch 'origin/update-extend-attn' into extend-…
zeeshanhaque21 Oct 29, 2025
1c7bb7b
Merge remote-tracking branch 'origin/main' into extend-attn-shortfin
zeeshanhaque21 Oct 29, 2025
2 changes: 1 addition & 1 deletion sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -64,7 +64,6 @@ def generate_batch_prefill(bs: int):
 
     seq_len_dim = seq_len_blocks_dim * llama_config.block_seq_stride
 
-    start_pos = torch.empty(bs, dtype=torch.int64)
     cache, cache_dynamic_shapes, cache_affinities = model.setup_cache()
 
     dynamic_shapes = {
@@ -95,6 +94,7 @@ def generate_batch_prefill(bs: int):
     if "start_pos" in dynamic_shapes:
         dynamic_shapes["start_pos"][0] = extend_bs
 
+    start_pos = torch.empty(bs_min, dtype=torch.int64)
     seq_block_ids = torch.empty(bs_min, block_dim_min, dtype=torch.int64)
     tokens = torch.empty(
         bs_min,
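For context, a minimal, self-contained sketch of the torch.export placeholder pattern the hunks above rely on: example tensors are allocated at the minimum batch size and dynamic_shapes marks the batch dimension as variable. ToyPrefill, the concrete sizes, and the bs_min comment below are illustrative stand-ins under those assumptions, not code from this repository.

import torch
from torch.export import Dim, export


class ToyPrefill(torch.nn.Module):
    # Stand-in computation; the real export wraps paged-attention prefill.
    def forward(self, tokens: torch.Tensor, start_pos: torch.Tensor):
        return tokens + start_pos.unsqueeze(-1)


bs_min = 2  # size-1 example inputs get specialized by torch.export, hence a minimum of 2
bs = Dim("bs", min=bs_min, max=8)

tokens = torch.zeros(bs_min, 16, dtype=torch.int64)
start_pos = torch.zeros(bs_min, dtype=torch.int64)

exported = export(
    ToyPrefill(),
    (tokens, start_pos),
    dynamic_shapes={"tokens": {0: bs}, "start_pos": {0: bs}},
)
print(exported)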
@@ -25,6 +25,7 @@
 
 class BatchMode(Enum):
     DEFAULT = "Default"
+    EXTEND_ATTENTION = "ExtendAttention"
 
 
 @dataclass(slots=True)
@@ -35,3 +36,4 @@ class BatchConfig:
     decode_functions: dict[int, sf.ProgramFunction]  # type: ignore
     prog_isolation: sf.ProgramIsolation  # type: ignore
     chunk_block_size: Optional[int] = None
+    token_budget: Optional[int] = None
11 changes: 11 additions & 0 deletions shortfin/python/shortfin_apps/llm/components/batching/factory.py
@@ -16,6 +16,7 @@
 from ..kvcache.base_attention_cache import BasePagedAttentionCache
 from .batching_trait import BatchingTrait
 from .modes.default import DefaultBatchingEngine
+from .modes.extend_attention import ExtendAttentionBatchingEngine
 from ..messages import LlmInferenceExecRequest


@@ -61,5 +62,15 @@ def _create_impl(batch_cfg: BatchConfig, page_cache: BasePagedAttentionCache, pr
             ),
             page_cache=page_cache,
         )
+    elif batch_cfg.mode == BatchMode.EXTEND_ATTENTION:
+        return _BatchingEngineImpl(
+            ExtendAttentionBatchingEngine.create(
+                batch_cfg=batch_cfg,
+                page_cache=page_cache,
+                prefill_fiber=prefill_fiber,
+                decode_fiber=decode_fiber,
+            ),
+            page_cache=page_cache,
+        )
 
     raise ValueError(f"Unsupported Batching Mode: {batch_cfg.mode}")
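For readers skimming the diff, a toy, self-contained mirror of the dispatch pattern the hunks above add: a BatchMode value selects which engine a factory constructs, and token_budget is an optional knob on the config. ToyBatchConfig, DefaultEngine, ExtendAttentionEngine, and create_engine are illustrative stand-ins, not shortfin's actual API.

from dataclasses import dataclass
from enum import Enum
from typing import Optional


class BatchMode(Enum):
    DEFAULT = "Default"
    EXTEND_ATTENTION = "ExtendAttention"


@dataclass(slots=True)
class ToyBatchConfig:
    mode: BatchMode
    chunk_block_size: Optional[int] = None
    token_budget: Optional[int] = None  # optional with a default, so existing call sites are unaffected


class DefaultEngine:
    pass


class ExtendAttentionEngine:
    pass


def create_engine(cfg: ToyBatchConfig):
    # Same shape as _create_impl above: dispatch on the mode enum,
    # fall through to a ValueError for unknown modes.
    if cfg.mode == BatchMode.DEFAULT:
        return DefaultEngine()
    elif cfg.mode == BatchMode.EXTEND_ATTENTION:
        return ExtendAttentionEngine()
    raise ValueError(f"Unsupported Batching Mode: {cfg.mode}")


engine = create_engine(ToyBatchConfig(mode=BatchMode.EXTEND_ATTENTION, token_budget=2048))
print(type(engine).__name__)  # ExtendAttentionEngine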