diff --git a/nanovllm/engine/block_manager.py b/nanovllm/engine/block_manager.py index 65d725e4..b0597c9f 100644 --- a/nanovllm/engine/block_manager.py +++ b/nanovllm/engine/block_manager.py @@ -102,11 +102,11 @@ def may_append(self, seq: Sequence): self._allocate_block(block_id) block_table.append(block_id) elif len(seq) % self.block_size == 0: - assert last_block.hash == -1 - token_ids = seq.block(seq.num_blocks-1) - prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1 - h = self.compute_hash(token_ids, prefix) - last_block.update(h, token_ids) - self.hash_to_block_id[h] = last_block.block_id + if last_block.hash == -1: + token_ids = seq.block(seq.num_blocks-1) + prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1 + h = self.compute_hash(token_ids, prefix) + last_block.update(h, token_ids) + self.hash_to_block_id[h] = last_block.block_id else: assert last_block.hash == -1 diff --git a/nanovllm/engine/scheduler.py b/nanovllm/engine/scheduler.py index 5bc19fe0..e4a09a6c 100644 --- a/nanovllm/engine/scheduler.py +++ b/nanovllm/engine/scheduler.py @@ -30,13 +30,15 @@ def schedule(self) -> tuple[list[Sequence], bool]: seq = self.waiting[0] if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq): break - num_seqs += 1 self.block_manager.allocate(seq) - num_batched_tokens += len(seq) - seq.num_cached_tokens seq.status = SequenceStatus.RUNNING self.waiting.popleft() self.running.append(seq) - scheduled_seqs.append(seq) + tokens_to_compute=len(seq) - seq.num_cached_tokens + if tokens_to_compute > 0: + num_seqs += 1 + scheduled_seqs.append(seq) + num_batched_tokens += tokens_to_compute if scheduled_seqs: return scheduled_seqs, True