@@ -30,12 +30,6 @@ class LLM:
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.
 
-    NOTE: This class is intended to be used for offline inference. For online
-    serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
-
-    NOTE: For the comprehensive list of arguments, see
-    :class:`~vllm.EngineArgs`.
-
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
@@ -84,6 +78,12 @@ class LLM:
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
+        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
+            :ref:`engine_args`)
+
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = False
@@ -253,7 +253,7 @@ def generate(
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
         the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
@@ -270,6 +270,11 @@ def generate(
         Returns:
             A list of `RequestOutput` objects containing the
             generated completions in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
@@ -393,7 +398,7 @@ def encode(
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
         the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
@@ -409,6 +414,11 @@ def encode(
         Returns:
            A list of `EmbeddingRequestOutput` objects containing the
            generated embeddings in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
0 commit comments