Skip to content

Commit c45ac42

Browse files
MadiatorLabs, DeJayDev, velaraptor-runpod
authored
vLLM Worker v0.15.0 — Upgrade from v0.11.x to v0.15.0 (#259)
* VLLM upgrade to 0.12.0 and compatibility fixes * MAX_NUM_BATCHED_TOKENS fix and CUDA tester * Sys kill worker instead of marking as failed * upgrade to vllm 0.12.0 * Update to vllm 0.15.0 and lora fix * Update for HUB and removal of deprecated env variables * reverted docker-bake changes * removed leftovers * Update src/handler.py Co-authored-by: Dj Isaac <contact@dejaydev.com> * Update src/utils.py Co-authored-by: Dj Isaac <contact@dejaydev.com> * Update src/handler.py Co-authored-by: Dj Isaac <contact@dejaydev.com> * Clean up of docs and comments in code * nit: lowercase p * nit: lowercase p --------- Co-authored-by: Dj Isaac <contact@dejaydev.com> Co-authored-by: chrisvela <chris.vela@runpod.io>
1 parent 6d6cbe7 commit c45ac42

File tree

8 files changed

+197
-355
lines changed

8 files changed

+197
-355
lines changed

.runpod/hub.json

Lines changed: 4 additions & 288 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"containerDiskInGb": 150,
1010
"gpuIds": "ADA_80_PRO,AMPERE_80",
1111
"gpuCount": 1,
12-
"allowedCudaVersions": ["12.9", "12.8", "12.7", "12.6", "12.5", "12.4"],
12+
"allowedCudaVersions": ["12.9", "12.8"],
1313
"presets": [
1414
{
1515
"name": "deepseek-ai/deepseek-r1-distill-llama-8b",
@@ -181,15 +181,6 @@
181181
"advanced": true
182182
}
183183
},
184-
{
185-
"key": "QUANTIZATION_PARAM_PATH",
186-
"input": {
187-
"name": "Quantization Param Path",
188-
"type": "string",
189-
"description": "Path to the JSON file containing the KV cache scaling factors.",
190-
"advanced": true
191-
}
192-
},
193184
{
194185
"key": "MAX_MODEL_LEN",
195186
"input": {
@@ -199,26 +190,6 @@
199190
"advanced": true
200191
}
201192
},
202-
{
203-
"key": "GUIDED_DECODING_BACKEND",
204-
"input": {
205-
"name": "Guided Decoding Backend",
206-
"type": "string",
207-
"description": "Which engine will be used for guided decoding by default.",
208-
"options": [
209-
{
210-
"label": "outlines",
211-
"value": "outlines"
212-
},
213-
{
214-
"label": "lm-format-enforcer",
215-
"value": "lm-format-enforcer"
216-
}
217-
],
218-
"default": "outlines",
219-
"advanced": true
220-
}
221-
},
222193
{
223194
"key": "DISTRIBUTED_EXECUTOR_BACKEND",
224195
"input": {
@@ -238,16 +209,6 @@
238209
"advanced": true
239210
}
240211
},
241-
{
242-
"key": "WORKER_USE_RAY",
243-
"input": {
244-
"name": "Worker Use Ray",
245-
"type": "boolean",
246-
"description": "Deprecated, use --distributed-executor-backend=ray.",
247-
"default": false,
248-
"advanced": true
249-
}
250-
},
251212
{
252213
"key": "RAY_WORKERS_USE_NSIGHT",
253214
"input": {
@@ -307,26 +268,6 @@
307268
"advanced": true
308269
}
309270
},
310-
{
311-
"key": "USE_V2_BLOCK_MANAGER",
312-
"input": {
313-
"name": "Use V2 Block Manager",
314-
"type": "boolean",
315-
"description": "Use BlockSpaceMangerV2.",
316-
"default": false,
317-
"advanced": true
318-
}
319-
},
320-
{
321-
"key": "NUM_LOOKAHEAD_SLOTS",
322-
"input": {
323-
"name": "Num Lookahead Slots",
324-
"type": "number",
325-
"description": "Experimental scheduling config necessary for speculative decoding.",
326-
"default": 0,
327-
"advanced": true
328-
}
329-
},
330271
{
331272
"key": "SEED",
332273
"input": {
@@ -412,53 +353,6 @@
412353
"advanced": true
413354
}
414355
},
415-
{
416-
"key": "ROPE_SCALING",
417-
"input": {
418-
"name": "RoPE Scaling",
419-
"type": "string",
420-
"description": "RoPE scaling configuration in JSON format.",
421-
"advanced": true
422-
}
423-
},
424-
{
425-
"key": "ROPE_THETA",
426-
"input": {
427-
"name": "RoPE Theta",
428-
"type": "number",
429-
"description": "RoPE theta. Use with rope_scaling.",
430-
"advanced": true
431-
}
432-
},
433-
{
434-
"key": "TOKENIZER_POOL_SIZE",
435-
"input": {
436-
"name": "Tokenizer Pool Size",
437-
"type": "number",
438-
"description": "Size of tokenizer pool to use for asynchronous tokenization.",
439-
"default": 0,
440-
"advanced": true
441-
}
442-
},
443-
{
444-
"key": "TOKENIZER_POOL_TYPE",
445-
"input": {
446-
"name": "Tokenizer Pool Type",
447-
"type": "string",
448-
"description": "Type of tokenizer pool to use for asynchronous tokenization.",
449-
"default": "ray",
450-
"advanced": true
451-
}
452-
},
453-
{
454-
"key": "TOKENIZER_POOL_EXTRA_CONFIG",
455-
"input": {
456-
"name": "Tokenizer Pool Extra Config",
457-
"type": "string",
458-
"description": "Extra config for tokenizer pool.",
459-
"advanced": true
460-
}
461-
},
462356
{
463357
"key": "ENABLE_LORA",
464358
"input": {
@@ -489,16 +383,6 @@
489383
"advanced": true
490384
}
491385
},
492-
{
493-
"key": "LORA_EXTRA_VOCAB_SIZE",
494-
"input": {
495-
"name": "LoRA Extra Vocab Size",
496-
"type": "number",
497-
"description": "Maximum size of extra vocabulary for LoRA adapters.",
498-
"default": 256,
499-
"advanced": true
500-
}
501-
},
502386
{
503387
"key": "LORA_DTYPE",
504388
"input": {
@@ -527,15 +411,6 @@
527411
"advanced": true
528412
}
529413
},
530-
{
531-
"key": "LONG_LORA_SCALING_FACTORS",
532-
"input": {
533-
"name": "Long LoRA Scaling Factors",
534-
"type": "string",
535-
"description": "Specify multiple scaling factors for LoRA adapters.",
536-
"advanced": true
537-
}
538-
},
539414
{
540415
"key": "MAX_CPU_LORAS",
541416
"input": {
@@ -615,107 +490,6 @@
615490
"advanced": true
616491
}
617492
},
618-
{
619-
"key": "SPECULATIVE_MODEL",
620-
"input": {
621-
"name": "Speculative Model",
622-
"type": "string",
623-
"description": "The name of the draft model to be used in speculative decoding.",
624-
"advanced": true
625-
}
626-
},
627-
{
628-
"key": "NUM_SPECULATIVE_TOKENS",
629-
"input": {
630-
"name": "Num Speculative Tokens",
631-
"type": "number",
632-
"description": "The number of speculative tokens to sample from the draft model.",
633-
"advanced": true
634-
}
635-
},
636-
{
637-
"key": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
638-
"input": {
639-
"name": "Speculative Draft Tensor Parallel Size",
640-
"type": "number",
641-
"description": "Number of tensor parallel replicas for the draft model.",
642-
"advanced": true
643-
}
644-
},
645-
{
646-
"key": "SPECULATIVE_MAX_MODEL_LEN",
647-
"input": {
648-
"name": "Speculative Max Model Length",
649-
"type": "number",
650-
"description": "The maximum sequence length supported by the draft model.",
651-
"advanced": true
652-
}
653-
},
654-
{
655-
"key": "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
656-
"input": {
657-
"name": "Speculative Disable by Batch Size",
658-
"type": "number",
659-
"description": "Disable speculative decoding if the number of enqueue requests is larger than this value.",
660-
"advanced": true
661-
}
662-
},
663-
{
664-
"key": "NGRAM_PROMPT_LOOKUP_MAX",
665-
"input": {
666-
"name": "Ngram Prompt Lookup Max",
667-
"type": "number",
668-
"description": "Max size of window for ngram prompt lookup in speculative decoding.",
669-
"advanced": true
670-
}
671-
},
672-
{
673-
"key": "NGRAM_PROMPT_LOOKUP_MIN",
674-
"input": {
675-
"name": "Ngram Prompt Lookup Min",
676-
"type": "number",
677-
"description": "Min size of window for ngram prompt lookup in speculative decoding.",
678-
"advanced": true
679-
}
680-
},
681-
{
682-
"key": "SPEC_DECODING_ACCEPTANCE_METHOD",
683-
"input": {
684-
"name": "Speculative Decoding Acceptance Method",
685-
"type": "string",
686-
"description": "Specify the acceptance method for draft token verification in speculative decoding.",
687-
"options": [
688-
{
689-
"label": "rejection_sampler",
690-
"value": "rejection_sampler"
691-
},
692-
{
693-
"label": "typical_acceptance_sampler",
694-
"value": "typical_acceptance_sampler"
695-
}
696-
],
697-
"default": "rejection_sampler",
698-
"advanced": true
699-
}
700-
},
701-
{
702-
"key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD",
703-
"input": {
704-
"name": "Typical Acceptance Sampler Posterior Threshold",
705-
"type": "number",
706-
"description": "Set the lower bound threshold for the posterior probability of a token to be accepted.",
707-
"advanced": true
708-
}
709-
},
710-
{
711-
"key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
712-
"input": {
713-
"name": "Typical Acceptance Sampler Posterior Alpha",
714-
"type": "number",
715-
"description": "A scaling factor for the entropy-based threshold for token acceptance.",
716-
"advanced": true
717-
}
718-
},
719493
{
720494
"key": "MODEL_LOADER_EXTRA_CONFIG",
721495
"input": {
@@ -726,49 +500,11 @@
726500
}
727501
},
728502
{
729-
"key": "PREEMPTION_MODE",
730-
"input": {
731-
"name": "Preemption Mode",
732-
"type": "string",
733-
"description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.",
734-
"advanced": true
735-
}
736-
},
737-
{
738-
"key": "PREEMPTION_CHECK_PERIOD",
739-
"input": {
740-
"name": "Preemption Check Period",
741-
"type": "number",
742-
"description": "How frequently the engine checks if a preemption happens.",
743-
"default": 1,
744-
"advanced": true
745-
}
746-
},
747-
{
748-
"key": "PREEMPTION_CPU_CAPACITY",
749-
"input": {
750-
"name": "Preemption CPU Capacity",
751-
"type": "number",
752-
"description": "The percentage of CPU memory used for the saved activations.",
753-
"default": 2,
754-
"advanced": true
755-
}
756-
},
757-
{
758-
"key": "MAX_LOG_LEN",
759-
"input": {
760-
"name": "Max Log Length",
761-
"type": "number",
762-
"description": "Max number of characters or ID numbers being printed in log.",
763-
"advanced": true
764-
}
765-
},
766-
{
767-
"key": "DISABLE_LOGGING_REQUEST",
503+
"key": "ENABLE_LOG_REQUESTS",
768504
"input": {
769-
"name": "Disable Logging Request",
505+
"name": "Enable Log Requests",
770506
"type": "boolean",
771-
"description": "Disable logging requests.",
507+
"description": "Enable vLLM request logging.",
772508
"default": false,
773509
"advanced": true
774510
}
@@ -840,16 +576,6 @@
840576
"advanced": true
841577
}
842578
},
843-
{
844-
"key": "MAX_SEQ_LEN_TO_CAPTURE",
845-
"input": {
846-
"name": "CUDA Graph Max Content Length",
847-
"type": "number",
848-
"description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode",
849-
"default": 8192,
850-
"advanced": true
851-
}
852-
},
853579
{
854580
"key": "DISABLE_CUSTOM_ALL_REDUCE",
855581
"input": {
@@ -958,16 +684,6 @@
958684
"advanced": true
959685
}
960686
},
961-
{
962-
"key": "DISABLE_LOG_REQUESTS",
963-
"input": {
964-
"name": "Disable Log Requests",
965-
"type": "boolean",
966-
"description": "Enables or disables vLLM request logging",
967-
"default": true,
968-
"advanced": true
969-
}
970-
},
971687
{
972688
"key": "ENABLE_AUTO_TOOL_CHOICE",
973689
"input": {

0 commit comments

Comments (0)