Commit 002800f

Align vLLM's beam search implementation with HF generate (#857)
1 parent e15932b commit 002800f

24 files changed, +599 −263 lines changed

docs/source/models/adding_model.rst
Lines changed: 1 addition & 1 deletion

```diff
@@ -59,7 +59,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
 +    kv_caches: List[KVCache],
 +    input_metadata: InputMetadata,
 +    cache_events: Optional[List[torch.cuda.Event]],
-+) -> Dict[int, SequenceOutputs]:
++) -> SamplerOutput:
 
 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture.
```
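Put together, the doc's example signature now reads as sketched below. This only restates the hunk above: the `self`, `input_ids`, and `positions` parameters and the vLLM types (`KVCache`, `InputMetadata`, `SamplerOutput`) are assumed from the unchanged parts of the doc example, not introduced here.

```python
# Sketch of the full rewritten forward signature the doc now prescribes.
# The first three parameters and the imported vLLM types are assumed from
# the surrounding, unchanged doc lines; only the return type changes here.
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    kv_caches: List[KVCache],
    input_metadata: InputMetadata,
    cache_events: Optional[List[torch.cuda.Event]],
) -> SamplerOutput:
    ...
```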

tests/conftest.py
Lines changed: 56 additions & 10 deletions

```diff
@@ -67,8 +67,8 @@ def generate(
                 output_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=False,
-            )[0]
-            output_ids = output_ids[0].cpu().tolist()
+            )
+            output_ids = output_ids.cpu().tolist()
             outputs.append((output_ids, output_str))
         return outputs
 
@@ -77,8 +77,34 @@ def generate_greedy(
         prompts: List[str],
         max_tokens: int,
     ) -> List[Tuple[List[int], str]]:
-        return self.generate(prompts, do_sample=False,
-                             max_new_tokens=max_tokens)
+        outputs = self.generate(prompts,
+                                do_sample=False,
+                                max_new_tokens=max_tokens)
+        for i in range(len(outputs)):
+            output_ids, output_str = outputs[i]
+            outputs[i] = (output_ids[0], output_str[0])
+        return outputs
+
+    def generate_beam_search(
+        self,
+        prompts: List[str],
+        beam_width: int,
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        outputs = self.generate(prompts,
+                                do_sample=False,
+                                max_new_tokens=max_tokens,
+                                num_beams=beam_width,
+                                num_return_sequences=beam_width)
+        for i in range(len(outputs)):
+            output_ids, output_str = outputs[i]
+            for j in range(len(output_ids)):
+                output_ids[j] = [
+                    x for x in output_ids[j]
+                    if x != self.tokenizer.pad_token_id
+                ]
+            outputs[i] = (output_ids, output_str)
+        return outputs
 
 
 @pytest.fixture
@@ -107,15 +133,20 @@ def generate(
         prompts: List[str],
         sampling_params: SamplingParams,
     ) -> List[Tuple[List[int], str]]:
-        req_outputs = self.model.generate(
-            prompts, sampling_params=sampling_params)
+        req_outputs = self.model.generate(prompts,
+                                          sampling_params=sampling_params)
         outputs = []
         for req_output in req_outputs:
             prompt_str = req_output.prompt
             prompt_ids = req_output.prompt_token_ids
-            output_str = req_output.outputs[0].text
-            output_ids = req_output.outputs[0].token_ids
-            outputs.append((prompt_ids + output_ids, prompt_str + output_str))
+            req_sample_output_ids = []
+            req_sample_output_strs = []
+            for sample in req_output.outputs:
+                output_str = sample.text
+                output_ids = sample.token_ids
+                req_sample_output_ids.append(prompt_ids + output_ids)
+                req_sample_output_strs.append(prompt_str + output_str)
+            outputs.append((req_sample_output_ids, req_sample_output_strs))
         return outputs
 
     def generate_greedy(
@@ -124,7 +155,22 @@ def generate_greedy(
         max_tokens: int,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        return self.generate(prompts, greedy_params)
+        outputs = self.generate(prompts, greedy_params)
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in
+                outputs]
+
+    def generate_beam_search(
+        self,
+        prompts: List[str],
+        beam_width: int,
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        beam_search_params = SamplingParams(n=beam_width,
+                                            use_beam_search=True,
+                                            temperature=0.0,
+                                            max_tokens=max_tokens)
+        outputs = self.generate(prompts, beam_search_params)
+        return outputs
 
 
 @pytest.fixture
```
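For orientation, the two new `generate_beam_search` helpers drive the same beam-search request through each API. The snippet below is a minimal, illustrative sketch of the equivalent direct calls and is not part of the commit; the model name and max-token value are taken from the test settings, while the prompt is a placeholder.

```python
# Hedged sketch of the two beam-search entry points the fixtures wrap.
# "facebook/opt-125m" and max_tokens=128 match the test settings; the
# prompt is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm import LLM, SamplingParams

prompt = "Hello, my name is"
beam_width = 4

# HF generate: beam search via num_beams / num_return_sequences.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
hf_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
hf_output_ids = hf_model.generate(input_ids,
                                  do_sample=False,
                                  max_new_tokens=128,
                                  num_beams=beam_width,
                                  num_return_sequences=beam_width)

# vLLM: beam search via SamplingParams, as the new fixture builds it.
llm = LLM(model="facebook/opt-125m")
beam_search_params = SamplingParams(n=beam_width,
                                    use_beam_search=True,
                                    temperature=0.0,
                                    max_tokens=128)
vllm_outputs = llm.generate([prompt], sampling_params=beam_search_params)
```

With this commit, the token sequences produced by the two paths are expected to match, which is exactly what the new test below asserts.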

tests/samplers/test_beam_search.py
Lines changed: 46 additions & 0 deletions

```diff
@@ -0,0 +1,46 @@
+"""Compare the outputs of HF and vLLM when using beam search.
+
+Run `pytest tests/samplers/test_beam_search.py --forked`.
+"""
+import pytest
+
+# FIXME(zhuohan): The test can not pass if we:
+#     1. Increase max_tokens to 256.
+#     2. Increase beam_width to 8.
+#     3. Use the model "huggyllama/llama-7b".
+MAX_TOKENS = [128]
+BEAM_WIDTHS = [4]
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", MAX_TOKENS)
+@pytest.mark.parametrize("beam_width", BEAM_WIDTHS)
+def test_beam_search_single_input(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                               max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, _ = hf_outputs[i]
+        vllm_output_ids, _ = vllm_outputs[i]
+        assert len(hf_output_ids) == len(vllm_output_ids)
+        for j in range(len(hf_output_ids)):
+            assert hf_output_ids[j] == vllm_output_ids[j], (
+                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
+                f"vLLM: {vllm_output_ids}")
```

vllm/core/block_manager.py
Lines changed: 2 additions & 6 deletions

```diff
@@ -172,9 +172,7 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool:
     def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
         # CPU block -> GPU block.
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
-        for seq in seq_group.get_seqs():
-            if seq.is_finished():
-                continue
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
             new_block_table: BlockTable = []
             block_table = self.block_tables[seq.seq_id]
 
@@ -203,9 +201,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
     def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
         # GPU block -> CPU block.
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
-        for seq in seq_group.get_seqs():
-            if seq.is_finished():
-                continue
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
             new_block_table: BlockTable = []
             block_table = self.block_tables[seq.seq_id]
 
```
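Both hunks replace the earlier "iterate over all sequences and skip the finished ones" pattern with an explicit status filter: swap_in now walks only SWAPPED sequences and swap_out only RUNNING ones. The accessor itself lives in vllm/sequence.py, which is not shown in this excerpt; the standalone sketch below uses toy stand-in classes purely to illustrate the filtering idea.

```python
# Toy, self-contained sketch of status-filtered sequence lookup. The real
# SequenceGroup/Sequence/SequenceStatus live in vllm/sequence.py; these
# stand-ins only mirror the filtering the block manager now relies on.
import enum
from dataclasses import dataclass, field
from typing import List, Optional


class SequenceStatus(enum.Enum):
    WAITING = enum.auto()
    RUNNING = enum.auto()
    SWAPPED = enum.auto()
    FINISHED_ABORTED = enum.auto()


@dataclass
class Sequence:
    seq_id: int
    status: SequenceStatus


@dataclass
class SequenceGroup:
    seqs: List[Sequence] = field(default_factory=list)

    def get_seqs(self,
                 status: Optional[SequenceStatus] = None) -> List[Sequence]:
        # No filter: return every sequence. With a filter: keep only the
        # sequences in the requested state (e.g. SWAPPED for swap_in).
        if status is None:
            return list(self.seqs)
        return [seq for seq in self.seqs if seq.status == status]


if __name__ == "__main__":
    group = SequenceGroup([Sequence(0, SequenceStatus.SWAPPED),
                           Sequence(1, SequenceStatus.FINISHED_ABORTED)])
    # Only seq 0 would be considered by swap_in under the new filtering.
    print([seq.seq_id for seq in group.get_seqs(SequenceStatus.SWAPPED)])
```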

vllm/core/scheduler.py
Lines changed: 45 additions & 69 deletions

```diff
@@ -7,8 +7,7 @@
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
-                           SequenceGroupMetadata, SequenceOutputs,
-                           SequenceStatus)
+                           SequenceGroupMetadata, SequenceStatus)
 
 logger = init_logger(__name__)
 
@@ -76,6 +75,7 @@ def __init__(
             num_cpu_blocks=self.cache_config.num_cpu_blocks,
         )
 
+        # TODO(zhuohan): Use deque instead of list for better performance.
         # Sequence groups in the WAITING state.
         self.waiting: List[SequenceGroup] = []
         # Sequence groups in the RUNNING state.
@@ -96,10 +96,11 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
                 if seq_group.request_id in request_ids:
                     # Remove the sequence group from the state queue.
                     state_queue.remove(seq_group)
-                    for seq in seq_group.seqs:
+                    for seq in seq_group.get_seqs():
                         if seq.is_finished():
                             continue
-                        self.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
+                        seq.status = SequenceStatus.FINISHED_ABORTED
+                        self.free_seq(seq)
                     request_ids.remove(seq_group.request_id)
                     if not request_ids:
                         return
@@ -123,13 +124,20 @@ def _schedule(self) -> SchedulerOutputs:
         if not self.swapped:
             ignored_seq_groups: List[SequenceGroup] = []
             scheduled: List[SequenceGroup] = []
+            # The total number of sequences on the fly, including the
+            # requests in the generation phase.
+            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
+                                for seq_group in self.running)
             num_batched_tokens = 0
             # Optimization: We do not sort the waiting queue since the preempted
             # sequence groups are added to the front and the new sequence groups
             # are added to the back.
             while self.waiting:
                 seq_group = self.waiting[0]
 
+                assert seq_group.num_seqs() == 1, (
+                    "Waiting sequence group should have only one prompt "
+                    "sequence.")
                 num_prompt_tokens = seq_group.get_seqs()[0].get_len()
                 if num_prompt_tokens > self.prompt_limit:
                     logger.warning(
@@ -152,11 +160,7 @@ def _schedule(self) -> SchedulerOutputs:
 
                 # The total number of sequences in the RUNNING state should not
                 # exceed the maximum number of sequences.
-                num_new_seqs = seq_group.num_seqs(
-                    status=SequenceStatus.WAITING)
-                num_curr_seqs = sum(
-                    seq_group.num_seqs(status=SequenceStatus.RUNNING)
-                    for seq_group in self.running)
+                num_new_seqs = seq_group.get_max_num_running_seqs()
                 if (num_curr_seqs + num_new_seqs >
                         self.scheduler_config.max_num_seqs):
                     break
@@ -165,6 +169,7 @@ def _schedule(self) -> SchedulerOutputs:
                 self._allocate(seq_group)
                 self.running.append(seq_group)
                 num_batched_tokens += num_prompt_tokens
+                num_curr_seqs += num_new_seqs
                 scheduled.append(seq_group)
 
             if scheduled:
@@ -210,30 +215,32 @@ def _schedule(self) -> SchedulerOutputs:
 
         # Swap in the sequence groups in the SWAPPED state if possible.
         self.swapped = self.policy.sort_by_priority(now, self.swapped)
-        while self.swapped and not blocks_to_swap_out:
-            seq_group = self.swapped[0]
-            # If the sequence group has been preempted in this step, stop.
-            if seq_group in preempted:
-                break
-            # If the sequence group cannot be swapped in, stop.
-            if not self.block_manager.can_swap_in(seq_group):
-                break
-
-            # The total number of sequences in the RUNNING state should not
-            # exceed the maximum number of sequences.
-            num_new_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
-            num_curr_seqs = sum(
-                seq_group.num_seqs(status=SequenceStatus.RUNNING)
-                for seq_group in self.running)
-            if (num_curr_seqs + num_new_seqs >
-                    self.scheduler_config.max_num_seqs):
-                break
-
-            seq_group = self.swapped.pop(0)
-            self._swap_in(seq_group, blocks_to_swap_in)
-            self._append_slot(seq_group, blocks_to_copy)
-            self.running.append(seq_group)
+        if not preempted:
+            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
+                                for seq_group in self.running)
+
+            while self.swapped:
+                seq_group = self.swapped[0]
+                # If the sequence group cannot be swapped in, stop.
+                if not self.block_manager.can_swap_in(seq_group):
+                    break
 
+                # The total number of sequences in the RUNNING state should not
+                # exceed the maximum number of sequences.
+                num_new_seqs = seq_group.get_max_num_running_seqs()
+                if (num_curr_seqs + num_new_seqs >
+                        self.scheduler_config.max_num_seqs):
+                    break
+
+                seq_group = self.swapped.pop(0)
+                self._swap_in(seq_group, blocks_to_swap_in)
+                self._append_slot(seq_group, blocks_to_copy)
+                num_curr_seqs += num_new_seqs
+                self.running.append(seq_group)
+
+        # Each sequence in the generation phase only takes one token slot.
+        # Therefore, the number of batched tokens is equal to the number of
+        # sequences in the RUNNING state.
         num_batched_tokens = sum(
             seq_group.num_seqs(status=SequenceStatus.RUNNING)
             for seq_group in self.running)
@@ -275,40 +282,10 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
             seq_group_metadata_list.append(seq_group_metadata)
         return seq_group_metadata_list, scheduler_outputs
 
-    def update(
-        self,
-        seq_outputs: Dict[int, SequenceOutputs],
-    ) -> List[SequenceGroup]:
-        scheduled: List[SequenceGroup] = []
-        for seq_group in self.running:
-            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                if seq.seq_id in seq_outputs:
-                    scheduled.append(seq_group)
-                    break
-
-        # Update the scheduled sequences and free blocks.
-        for seq_group in scheduled:
-            # Process beam search results before processing the new tokens.
-            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                output = seq_outputs[seq.seq_id]
-                if seq.seq_id != output.parent_seq_id:
-                    # The sequence is a fork of the parent sequence (beam
-                    # search). Free the current sequence.
-                    self.block_manager.free(seq)
-                    # Fork the parent sequence.
-                    parent_seq = seq_group.find(output.parent_seq_id)
-                    parent_seq.fork(seq)
-                    self.block_manager.fork(parent_seq, seq)
-
-            # Process the new tokens.
-            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                # Append a new token to the sequence.
-                output = seq_outputs[seq.seq_id]
-                seq.append_token_id(output.output_token, output.logprobs)
-        return scheduled
+    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        self.block_manager.fork(parent_seq, child_seq)
 
-    def free_seq(self, seq: Sequence, finish_status: SequenceStatus) -> None:
-        seq.status = finish_status
+    def free_seq(self, seq: Sequence) -> None:
         self.block_manager.free(seq)
 
     def free_finished_seq_groups(self) -> None:
@@ -345,17 +322,16 @@ def _preempt(
         # If preemption mode is not specified, we determine the mode as follows:
         # We use recomputation by default since it incurs lower overhead than
         # swapping. However, when the sequence group has multiple sequences
-        # (e.g., beam search), recomputation is not supported. In such a case,
-        # we use swapping instead.
+        # (e.g., beam search), recomputation is not currently supported. In
+        # such a case, we use swapping instead.
         # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
         # As swapped sequences are prioritized over waiting sequences,
         # sequence groups with multiple sequences are implicitly prioritized
        # over sequence groups with a single sequence.
         # TODO(woosuk): Support recomputation for sequence groups with multiple
         # sequences. This may require a more sophisticated CUDA kernel.
         if preemption_mode is None:
-            seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-            if len(seqs) == 1:
+            if seq_group.get_max_num_running_seqs() == 1:
                 preemption_mode = PreemptionMode.RECOMPUTE
             else:
                 preemption_mode = PreemptionMode.SWAP
```
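The scheduler hunks above budget batch capacity with `seq_group.get_max_num_running_seqs()` instead of counting only currently RUNNING sequences, so a beam-search request reserves room for all of its future beams while it is still a single waiting or swapped prompt sequence. That method is defined in vllm/sequence.py, which this commit also touches but which is not shown in this excerpt; the snippet below is only a hedged, standalone sketch of the idea using toy stand-in classes, not the actual implementation.

```python
# Hedged sketch (not the vllm/sequence.py source) of an upper bound on how
# many sequences of a request can be RUNNING at once: a beam-search or
# best_of-style request eventually occupies `best_of` slots even while it is
# still a single prompt sequence in the WAITING or SWAPPED queue.
from dataclasses import dataclass


@dataclass
class SamplingParamsSketch:
    best_of: int = 1
    use_beam_search: bool = False


@dataclass
class SequenceGroupSketch:
    sampling_params: SamplingParamsSketch
    num_unfinished_seqs: int = 1  # one prompt sequence before generation

    def get_max_num_running_seqs(self) -> int:
        # Beam search keeps `best_of` candidate beams alive for the rest of
        # the request, so reserve that many slots up front.
        if self.sampling_params.use_beam_search:
            return self.sampling_params.best_of
        # Otherwise the group grows to at most `best_of` parallel samples
        # and shrinks as individual sequences finish.
        return max(self.sampling_params.best_of, self.num_unfinished_seqs)


if __name__ == "__main__":
    group = SequenceGroupSketch(
        SamplingParamsSketch(best_of=4, use_beam_search=True))
    print(group.get_max_num_running_seqs())  # 4 slots reserved up front
```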
