
Commit 0ae11f7

[Mypy] Part 3 fix typing for nested directories for most of directory (#4161)
1 parent 34128a6 commit 0ae11f7

File tree

29 files changed: +126 −88 lines

.github/workflows/mypy.yaml

Lines changed: 15 additions & 14 deletions
@@ -32,19 +32,20 @@ jobs:
         pip install types-setuptools
     - name: Mypy
       run: |
-        mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/attention --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
         mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-        mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-        # TODO(sang): Follow up
-        # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/engine --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        mypy vllm/model_executor/*.py --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        # mypy vllm/lora/*.py --config-file pyproject.toml

format.sh

Lines changed: 12 additions & 14 deletions
@@ -94,21 +94,19 @@ echo 'vLLM yapf: Done'

 # Run mypy
 echo 'vLLM mypy:'
-mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/attention --config-file pyproject.toml
 mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-# TODO(sang): Follow up
-mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/entrypoints --config-file pyproject.toml
+mypy vllm/executor --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/transformers_utils --config-file pyproject.toml
+mypy vllm/engine --config-file pyproject.toml
+mypy vllm/worker --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
+mypy vllm/model_executor/*.py --config-file pyproject.toml
+# mypy vllm/lora/*.py --config-file pyproject.toml


 CODESPELL_EXCLUDES=(

pyproject.toml

Lines changed: 4 additions & 2 deletions
@@ -46,15 +46,17 @@ ignore = [
 python_version = "3.8"

 ignore_missing_imports = true
-check_untyped_defs = true
+check_untyped_defs = true
+follow_imports = "skip"

 files = "vllm"
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 exclude = [
     "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+    # Ignore triton kernels in ops.
+    'vllm/attention/ops/.*\.py$'
 ]

-
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
 skip = "./tests/prompts,./benchmarks/sonnet.txt"
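
A note on the config change above: `follow_imports = "skip"` now lives in pyproject.toml instead of being passed as `--follow-imports=skip` on every mypy invocation. With that setting, a module that is merely imported (and not explicitly handed to mypy) is silently replaced by `Any`, and a per-file glob such as `vllm/core/*.py` never hands files in nested subdirectories to mypy at all, which appears to be the "nested dir" problem noted in the TODO comments. A minimal sketch of both effects with hypothetical modules, not vLLM code:

```python
# Sketch only: imagine this layout (hypothetical package, not vLLM's):
#
#   pkg/helper.py          <- defines scale()
#   pkg/nested/caller.py   <- misuses scale()
#
# --- pkg/helper.py ---
def scale(x: float, factor: float) -> float:
    """Multiply x by factor."""
    return x * factor

# --- pkg/nested/caller.py (shown as comments) ---
# from pkg.helper import scale
# scale("oops", 2.0)   # wrong argument type
#
# `mypy pkg/*.py` never sees nested/caller.py (the glob is not recursive),
# and with follow_imports = "skip" a module that is only imported is treated
# as Any, so the bad call goes unreported. `mypy pkg` checks the whole tree,
# so mypy sees the real signature of scale() and flags the str argument.
```

That trade-off is presumably why the CI workflow and format.sh above now pass whole directories where they can and keep TODO notes on the paths still checked via globs.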

vllm/attention/backends/abstract.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata[AttentionMetadataPerStage],
+        attn_metadata: AttentionMetadata,
         kv_scale: float,
     ) -> torch.Tensor:
         raise NotImplementedError
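
One consequence of dropping the subscript in this signature: mypy treats a bare generic annotation as if it were parameterized with `Any`, so `attn_metadata: AttentionMetadata` accepts any per-stage metadata without the extra generic bookkeeping. A rough illustration of that rule with a made-up generic class, not vLLM's actual types:

```python
from typing import Any, Generic, List, TypeVar

T = TypeVar("T")


class Metadata(Generic[T]):
    """Stand-in for a generic metadata container (hypothetical)."""

    def __init__(self, stages: List[T]) -> None:
        self.stages = stages


def use_parameterized(m: Metadata[int]) -> int:
    # m.stages is List[int] here, so the element type is enforced.
    return m.stages[0]


def use_bare(m: Metadata) -> Any:
    # A bare generic annotation is treated by mypy as Metadata[Any],
    # so m.stages is List[Any] and any element type is accepted.
    return m.stages[0]


# Both calls type-check; the bare annotation simply loses precision.
print(use_parameterized(Metadata([1, 2, 3])))      # 1
print(use_bare(Metadata(["prefill", "decode"])))   # prefill
```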

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 1 addition & 0 deletions
@@ -248,6 +248,7 @@ def forward(

         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
+            assert prefill_meta.prompt_lens is not None
             if kv_cache is None or prefill_meta.block_tables.numel() == 0:
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
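
This assert, like the similar ones added to torch_sdpa.py, xformers.py, and block_table.py below, serves the type checker as much as the runtime: when an attribute is declared `Optional`, mypy flags every use that assumes the value is present, and an `assert x is not None` narrows the type for the rest of the block. A minimal sketch of the pattern with hypothetical names, not the real vLLM classes:

```python
from typing import List, Optional


class PrefillMetadata:
    """Hypothetical stand-in for the real metadata class."""

    def __init__(self, prompt_lens: Optional[List[int]] = None) -> None:
        # Optional: not every kind of batch carries prompt lengths.
        self.prompt_lens = prompt_lens


def total_prompt_tokens(meta: PrefillMetadata) -> int:
    # Without this assert, mypy reports that meta.prompt_lens may be None
    # at the sum() below; the assert narrows Optional[List[int]] to
    # List[int] for the remainder of the function.
    assert meta.prompt_lens is not None
    return sum(meta.prompt_lens)


print(total_prompt_tokens(PrefillMetadata(prompt_lens=[7, 3])))  # 10
```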

vllm/attention/backends/torch_sdpa.py

Lines changed: 2 additions & 1 deletion
@@ -106,7 +106,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: Optional[torch.Tensor],
-        attn_metadata: TorchSDPAMetadata,
+        attn_metadata: TorchSDPAMetadata,  # type: ignore
         kv_scale: float,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -136,6 +136,7 @@ def forward(
                                      kv_scale)

         if attn_metadata.is_prompt:
+            assert attn_metadata.prompt_lens is not None
             if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
                 if self.num_kv_heads != self.num_heads:
                     key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
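
The `# type: ignore` added to this signature handles a different mypy rule than the asserts do: the abstract `forward` (see abstract.py above) declares `attn_metadata: AttentionMetadata`, and an override that narrows the parameter to a specific subclass such as `TorchSDPAMetadata` violates the override-compatibility (Liskov) rule that mypy enforces; the comment suppresses that single diagnostic. A stripped-down sketch of the situation with hypothetical classes:

```python
from typing import List, Optional


class AttentionMetadataBase:
    """Hypothetical base metadata type."""
    is_prompt: bool = True


class SDPAMetadata(AttentionMetadataBase):
    """Hypothetical backend-specific metadata with extra fields."""
    prompt_lens: Optional[List[int]] = None


class AttentionImplBase:
    def forward(self, metadata: AttentionMetadataBase) -> str:
        raise NotImplementedError


class SDPAImpl(AttentionImplBase):
    # Narrowing the parameter type to the subclass breaks the usual
    # override-compatibility rule, so mypy reports an [override] error
    # on this line; the ignore comment silences just that diagnostic.
    def forward(self, metadata: SDPAMetadata) -> str:  # type: ignore[override]
        return f"prompt_lens={metadata.prompt_lens}"


print(SDPAImpl().forward(SDPAMetadata()))  # prompt_lens=None
```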

vllm/attention/backends/xformers.py

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ def _run_memory_efficient_xformers_forward(
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         """
+        assert attn_metadata.prompt_lens is not None
         original_query = query
         if self.num_kv_heads != self.num_heads:
             # GQA/MQA requires the shape [B, M, G, H, K].

vllm/core/block/block_table.py

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@ def append_token_ids(self,
             token_ids (List[int]): The sequence of token IDs to be appended.
         """
         assert self._is_allocated
+        assert self._blocks is not None

         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                     num_lookahead_slots)

vllm/core/block/common.py

Lines changed: 4 additions & 2 deletions
@@ -99,7 +99,7 @@ def __init__(
         refcounter: RefCounter,
         allocator: BlockAllocator,
     ):
-        self._copy_on_writes = defaultdict(list)
+        self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
         self._refcounter = refcounter
         self._allocator = allocator

@@ -138,6 +138,8 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
                 prev_block=block.prev_block).block_id

         # Track src/dst copy.
+        assert src_block_id is not None
+        assert block_id is not None
         self._copy_on_writes[src_block_id].append(block_id)

         return block_id
@@ -180,6 +182,6 @@ def recurse(block: Block, lst: List[Block]) -> None:
             recurse(block.prev_block, lst)
             lst.append(block)

-        all_blocks = []
+        all_blocks: List[Block] = []
         recurse(last_block, all_blocks)
         return all_blocks
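
The two annotations added in this file address a standard mypy complaint: when a container starts out empty, as with `defaultdict(list)` or `[]`, mypy cannot infer what it will hold and asks for an explicit annotation. A small self-contained sketch of the same fix, with hypothetical names and `BlockId` aliased locally:

```python
from collections import defaultdict
from typing import Dict, List

BlockId = int  # local alias for this sketch; vLLM has its own definition


class CopyOnWriteTracker:
    def __init__(self) -> None:
        # Without the annotation, mypy asks for one ("Need type annotation"),
        # because neither the key nor the value type can be inferred.
        self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)

    def record(self, src: BlockId, dst: BlockId) -> None:
        self._copy_on_writes[src].append(dst)

    def all_ids(self) -> List[BlockId]:
        # Same story for an empty list literal: annotate it so mypy knows
        # what may be appended later.
        collected: List[BlockId] = []
        for src, dsts in self._copy_on_writes.items():
            collected.append(src)
            collected.extend(dsts)
        return collected


tracker = CopyOnWriteTracker()
tracker.record(1, 2)
print(tracker.all_ids())  # [1, 2]
```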

vllm/core/block/interfaces.py

Lines changed: 2 additions & 4 deletions
@@ -52,8 +52,7 @@ def __call__(
 class BlockAllocator(ABC):

     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass

     @abstractmethod
@@ -98,8 +97,7 @@ class NoFreeBlocksError(ValueError):
 class DeviceAwareBlockAllocator(BlockAllocator):

     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass

     @abstractmethod
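
Both abstract `allocate_mutable` declarations lose their `device` parameter here. A plausible reason (my inference, not stated in the diff) is signature compatibility: mypy checks every concrete override against the abstract method it implements, and implementations that do not take a `device` argument would otherwise be reported as incompatible. A toy illustration of that check with hypothetical allocator classes:

```python
from abc import ABC, abstractmethod
from typing import Optional


class Block:
    """Hypothetical block type."""


class BlockAllocatorBase(ABC):
    # If this abstract signature required an extra `device` parameter,
    # mypy would report NaiveAllocator.allocate_mutable below as an
    # incompatible override; keeping the base signature aligned with the
    # implementations avoids that.
    @abstractmethod
    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
        ...


class NaiveAllocator(BlockAllocatorBase):
    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
        return Block()


print(type(NaiveAllocator().allocate_mutable(None)).__name__)  # Block
```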
