|
9 | 9 | "containerDiskInGb": 150, |
10 | 10 | "gpuIds": "ADA_80_PRO,AMPERE_80", |
11 | 11 | "gpuCount": 1, |
12 | | - "allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5", "12.4"], |
| 12 | + "allowedCudaVersions": ["12.9", "12.8"], |
13 | 13 | "presets": [ |
14 | 14 | { |
15 | 15 | "name": "deepseek-ai/deepseek-r1-distill-llama-8b", |
|
181 | 181 | "advanced": true |
182 | 182 | } |
183 | 183 | }, |
184 | | - { |
185 | | - "key": "QUANTIZATION_PARAM_PATH", |
186 | | - "input": { |
187 | | - "name": "Quantization Param Path", |
188 | | - "type": "string", |
189 | | - "description": "Path to the JSON file containing the KV cache scaling factors.", |
190 | | - "advanced": true |
191 | | - } |
192 | | - }, |
193 | 184 | { |
194 | 185 | "key": "MAX_MODEL_LEN", |
195 | 186 | "input": { |
|
199 | 190 | "advanced": true |
200 | 191 | } |
201 | 192 | }, |
202 | | - { |
203 | | - "key": "GUIDED_DECODING_BACKEND", |
204 | | - "input": { |
205 | | - "name": "Guided Decoding Backend", |
206 | | - "type": "string", |
207 | | - "description": "Which engine will be used for guided decoding by default.", |
208 | | - "options": [ |
209 | | - { |
210 | | - "label": "outlines", |
211 | | - "value": "outlines" |
212 | | - }, |
213 | | - { |
214 | | - "label": "lm-format-enforcer", |
215 | | - "value": "lm-format-enforcer" |
216 | | - } |
217 | | - ], |
218 | | - "default": "outlines", |
219 | | - "advanced": true |
220 | | - } |
221 | | - }, |
222 | 193 | { |
223 | 194 | "key": "DISTRIBUTED_EXECUTOR_BACKEND", |
224 | 195 | "input": { |
|
238 | 209 | "advanced": true |
239 | 210 | } |
240 | 211 | }, |
241 | | - { |
242 | | - "key": "WORKER_USE_RAY", |
243 | | - "input": { |
244 | | - "name": "Worker Use Ray", |
245 | | - "type": "boolean", |
246 | | - "description": "Deprecated, use --distributed-executor-backend=ray.", |
247 | | - "default": false, |
248 | | - "advanced": true |
249 | | - } |
250 | | - }, |
251 | 212 | { |
252 | 213 | "key": "RAY_WORKERS_USE_NSIGHT", |
253 | 214 | "input": { |
|
307 | 268 | "advanced": true |
308 | 269 | } |
309 | 270 | }, |
310 | | - { |
311 | | - "key": "USE_V2_BLOCK_MANAGER", |
312 | | - "input": { |
313 | | - "name": "Use V2 Block Manager", |
314 | | - "type": "boolean", |
315 | | - "description": "Use BlockSpaceMangerV2.", |
316 | | - "default": false, |
317 | | - "advanced": true |
318 | | - } |
319 | | - }, |
320 | | - { |
321 | | - "key": "NUM_LOOKAHEAD_SLOTS", |
322 | | - "input": { |
323 | | - "name": "Num Lookahead Slots", |
324 | | - "type": "number", |
325 | | - "description": "Experimental scheduling config necessary for speculative decoding.", |
326 | | - "default": 0, |
327 | | - "advanced": true |
328 | | - } |
329 | | - }, |
330 | 271 | { |
331 | 272 | "key": "SEED", |
332 | 273 | "input": { |
|
412 | 353 | "advanced": true |
413 | 354 | } |
414 | 355 | }, |
415 | | - { |
416 | | - "key": "ROPE_SCALING", |
417 | | - "input": { |
418 | | - "name": "RoPE Scaling", |
419 | | - "type": "string", |
420 | | - "description": "RoPE scaling configuration in JSON format.", |
421 | | - "advanced": true |
422 | | - } |
423 | | - }, |
424 | | - { |
425 | | - "key": "ROPE_THETA", |
426 | | - "input": { |
427 | | - "name": "RoPE Theta", |
428 | | - "type": "number", |
429 | | - "description": "RoPE theta. Use with rope_scaling.", |
430 | | - "advanced": true |
431 | | - } |
432 | | - }, |
433 | | - { |
434 | | - "key": "TOKENIZER_POOL_SIZE", |
435 | | - "input": { |
436 | | - "name": "Tokenizer Pool Size", |
437 | | - "type": "number", |
438 | | - "description": "Size of tokenizer pool to use for asynchronous tokenization.", |
439 | | - "default": 0, |
440 | | - "advanced": true |
441 | | - } |
442 | | - }, |
443 | | - { |
444 | | - "key": "TOKENIZER_POOL_TYPE", |
445 | | - "input": { |
446 | | - "name": "Tokenizer Pool Type", |
447 | | - "type": "string", |
448 | | - "description": "Type of tokenizer pool to use for asynchronous tokenization.", |
449 | | - "default": "ray", |
450 | | - "advanced": true |
451 | | - } |
452 | | - }, |
453 | | - { |
454 | | - "key": "TOKENIZER_POOL_EXTRA_CONFIG", |
455 | | - "input": { |
456 | | - "name": "Tokenizer Pool Extra Config", |
457 | | - "type": "string", |
458 | | - "description": "Extra config for tokenizer pool.", |
459 | | - "advanced": true |
460 | | - } |
461 | | - }, |
462 | 356 | { |
463 | 357 | "key": "ENABLE_LORA", |
464 | 358 | "input": { |
|
489 | 383 | "advanced": true |
490 | 384 | } |
491 | 385 | }, |
492 | | - { |
493 | | - "key": "LORA_EXTRA_VOCAB_SIZE", |
494 | | - "input": { |
495 | | - "name": "LoRA Extra Vocab Size", |
496 | | - "type": "number", |
497 | | - "description": "Maximum size of extra vocabulary for LoRA adapters.", |
498 | | - "default": 256, |
499 | | - "advanced": true |
500 | | - } |
501 | | - }, |
502 | 386 | { |
503 | 387 | "key": "LORA_DTYPE", |
504 | 388 | "input": { |
|
527 | 411 | "advanced": true |
528 | 412 | } |
529 | 413 | }, |
530 | | - { |
531 | | - "key": "LONG_LORA_SCALING_FACTORS", |
532 | | - "input": { |
533 | | - "name": "Long LoRA Scaling Factors", |
534 | | - "type": "string", |
535 | | - "description": "Specify multiple scaling factors for LoRA adapters.", |
536 | | - "advanced": true |
537 | | - } |
538 | | - }, |
539 | 414 | { |
540 | 415 | "key": "MAX_CPU_LORAS", |
541 | 416 | "input": { |
|
615 | 490 | "advanced": true |
616 | 491 | } |
617 | 492 | }, |
618 | | - { |
619 | | - "key": "SPECULATIVE_MODEL", |
620 | | - "input": { |
621 | | - "name": "Speculative Model", |
622 | | - "type": "string", |
623 | | - "description": "The name of the draft model to be used in speculative decoding.", |
624 | | - "advanced": true |
625 | | - } |
626 | | - }, |
627 | | - { |
628 | | - "key": "NUM_SPECULATIVE_TOKENS", |
629 | | - "input": { |
630 | | - "name": "Num Speculative Tokens", |
631 | | - "type": "number", |
632 | | - "description": "The number of speculative tokens to sample from the draft model.", |
633 | | - "advanced": true |
634 | | - } |
635 | | - }, |
636 | | - { |
637 | | - "key": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", |
638 | | - "input": { |
639 | | - "name": "Speculative Draft Tensor Parallel Size", |
640 | | - "type": "number", |
641 | | - "description": "Number of tensor parallel replicas for the draft model.", |
642 | | - "advanced": true |
643 | | - } |
644 | | - }, |
645 | | - { |
646 | | - "key": "SPECULATIVE_MAX_MODEL_LEN", |
647 | | - "input": { |
648 | | - "name": "Speculative Max Model Length", |
649 | | - "type": "number", |
650 | | - "description": "The maximum sequence length supported by the draft model.", |
651 | | - "advanced": true |
652 | | - } |
653 | | - }, |
654 | | - { |
655 | | - "key": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", |
656 | | - "input": { |
657 | | - "name": "Speculative Disable by Batch Size", |
658 | | - "type": "number", |
659 | | - "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", |
660 | | - "advanced": true |
661 | | - } |
662 | | - }, |
663 | | - { |
664 | | - "key": "NGRAM_PROMPT_LOOKUP_MAX", |
665 | | - "input": { |
666 | | - "name": "Ngram Prompt Lookup Max", |
667 | | - "type": "number", |
668 | | - "description": "Max size of window for ngram prompt lookup in speculative decoding.", |
669 | | - "advanced": true |
670 | | - } |
671 | | - }, |
672 | | - { |
673 | | - "key": "NGRAM_PROMPT_LOOKUP_MIN", |
674 | | - "input": { |
675 | | - "name": "Ngram Prompt Lookup Min", |
676 | | - "type": "number", |
677 | | - "description": "Min size of window for ngram prompt lookup in speculative decoding.", |
678 | | - "advanced": true |
679 | | - } |
680 | | - }, |
681 | | - { |
682 | | - "key": "SPEC_DECODING_ACCEPTANCE_METHOD", |
683 | | - "input": { |
684 | | - "name": "Speculative Decoding Acceptance Method", |
685 | | - "type": "string", |
686 | | - "description": "Specify the acceptance method for draft token verification in speculative decoding.", |
687 | | - "options": [ |
688 | | - { |
689 | | - "label": "rejection_sampler", |
690 | | - "value": "rejection_sampler" |
691 | | - }, |
692 | | - { |
693 | | - "label": "typical_acceptance_sampler", |
694 | | - "value": "typical_acceptance_sampler" |
695 | | - } |
696 | | - ], |
697 | | - "default": "rejection_sampler", |
698 | | - "advanced": true |
699 | | - } |
700 | | - }, |
701 | | - { |
702 | | - "key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", |
703 | | - "input": { |
704 | | - "name": "Typical Acceptance Sampler Posterior Threshold", |
705 | | - "type": "number", |
706 | | - "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", |
707 | | - "advanced": true |
708 | | - } |
709 | | - }, |
710 | | - { |
711 | | - "key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", |
712 | | - "input": { |
713 | | - "name": "Typical Acceptance Sampler Posterior Alpha", |
714 | | - "type": "number", |
715 | | - "description": "A scaling factor for the entropy-based threshold for token acceptance.", |
716 | | - "advanced": true |
717 | | - } |
718 | | - }, |
719 | 493 | { |
720 | 494 | "key": "MODEL_LOADER_EXTRA_CONFIG", |
721 | 495 | "input": { |
|
726 | 500 | } |
727 | 501 | }, |
728 | 502 | { |
729 | | - "key": "PREEMPTION_MODE", |
730 | | - "input": { |
731 | | - "name": "Preemption Mode", |
732 | | - "type": "string", |
733 | | - "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", |
734 | | - "advanced": true |
735 | | - } |
736 | | - }, |
737 | | - { |
738 | | - "key": "PREEMPTION_CHECK_PERIOD", |
739 | | - "input": { |
740 | | - "name": "Preemption Check Period", |
741 | | - "type": "number", |
742 | | - "description": "How frequently the engine checks if a preemption happens.", |
743 | | - "default": 1, |
744 | | - "advanced": true |
745 | | - } |
746 | | - }, |
747 | | - { |
748 | | - "key": "PREEMPTION_CPU_CAPACITY", |
749 | | - "input": { |
750 | | - "name": "Preemption CPU Capacity", |
751 | | - "type": "number", |
752 | | - "description": "The percentage of CPU memory used for the saved activations.", |
753 | | - "default": 2, |
754 | | - "advanced": true |
755 | | - } |
756 | | - }, |
757 | | - { |
758 | | - "key": "MAX_LOG_LEN", |
759 | | - "input": { |
760 | | - "name": "Max Log Length", |
761 | | - "type": "number", |
762 | | - "description": "Max number of characters or ID numbers being printed in log.", |
763 | | - "advanced": true |
764 | | - } |
765 | | - }, |
766 | | - { |
767 | | - "key": "DISABLE_LOGGING_REQUEST", |
| 503 | + "key": "ENABLE_LOG_REQUESTS", |
768 | 504 | "input": { |
769 | | - "name": "Disable Logging Request", |
| 505 | + "name": "Enable Log Requests", |
770 | 506 | "type": "boolean", |
771 | | - "description": "Disable logging requests.", |
| 507 | + "description": "Enable vLLM request logging.", |
772 | 508 | "default": false, |
773 | 509 | "advanced": true |
774 | 510 | } |
|
840 | 576 | "advanced": true |
841 | 577 | } |
842 | 578 | }, |
843 | | - { |
844 | | - "key": "MAX_SEQ_LEN_TO_CAPTURE", |
845 | | - "input": { |
846 | | - "name": "CUDA Graph Max Content Length", |
847 | | - "type": "number", |
848 | | - "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", |
849 | | - "default": 8192, |
850 | | - "advanced": true |
851 | | - } |
852 | | - }, |
853 | 579 | { |
854 | 580 | "key": "DISABLE_CUSTOM_ALL_REDUCE", |
855 | 581 | "input": { |
|
958 | 684 | "advanced": true |
959 | 685 | } |
960 | 686 | }, |
961 | | - { |
962 | | - "key": "DISABLE_LOG_REQUESTS", |
963 | | - "input": { |
964 | | - "name": "Disable Log Requests", |
965 | | - "type": "boolean", |
966 | | - "description": "Enables or disables vLLM request logging", |
967 | | - "default": true, |
968 | | - "advanced": true |
969 | | - } |
970 | | - }, |
971 | 687 | { |
972 | 688 | "key": "ENABLE_AUTO_TOOL_CHOICE", |
973 | 689 | "input": { |
|
0 commit comments