
Commit 94efac2

russellb authored and dbyoung18 committed
[Core] Upgrade to xgrammar 0.1.18, add cache size limit (vllm-project#16283)
Signed-off-by: Russell Bryant <[email protected]>
1 parent b01a242 · commit 94efac2

4 files changed: 22 additions, 3 deletions

requirements/common.txt

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
 llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

vllm/envs.py

Lines changed: 7 additions & 0 deletions
@@ -106,6 +106,7 @@
     VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_USE_DEEP_GEMM: bool = False
+    VLLM_XGRAMMAR_CACHE_MB: int = 0
 
 
 def get_default_cache_root():
@@ -697,6 +698,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # Allow use of DeepGemm kernels for fused moe ops.
     "VLLM_USE_DEEP_GEMM":
     lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
+
+    # Control the cache size used by the xgrammar compiler. The default
+    # of 512 MB should be enough for roughly 1000 JSON schemas.
+    # It can be changed with this variable if needed.
+    "VLLM_XGRAMMAR_CACHE_MB":
+    lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
 }
 
 # end-env-vars-definition
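
Since each entry in this dict is a lambda over os.getenv, the value is read when first accessed, so the knob can be set in the environment before vLLM consults it. Note the sizing implied by the comment: 512 MB for roughly 1000 JSON schemas works out to about 0.5 MB per compiled schema. A minimal sketch of overriding it from Python, assuming a standard vLLM install (the value 1024 is illustrative):

    import os
    os.environ["VLLM_XGRAMMAR_CACHE_MB"] = "1024"  # set before the value is first read

    import vllm.envs
    assert vllm.envs.VLLM_XGRAMMAR_CACHE_MB == 1024  # parsed as an int by the lambda above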

vllm/model_executor/guided_decoding/xgrammar_decoding.py

Lines changed: 7 additions & 1 deletion
@@ -10,6 +10,7 @@
 
 import torch
 
+import vllm.envs
 from vllm.logger import init_logger
 
 try:
@@ -131,8 +132,13 @@ def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:
                 encoded_vocab=config_data.encoded_vocab,
                 metadata=config_data.metadata,
             )
+            cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
             cls._cache[cache_key] = xgr.GrammarCompiler(
-                tokenizer_info, max_threads=config.max_threads)
+                tokenizer_info,
+                max_threads=config.max_threads,
+                cache_enabled=True,
+                cache_limit_bytes=cache_size,
+            )
 
         return cls._cache[cache_key]
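For context, a standalone sketch of what the new construction does; this is not part of the commit, and the gpt2 tokenizer and max_threads value are illustrative assumptions:

    import xgrammar as xgr
    from transformers import AutoTokenizer

    # Illustrative tokenizer; vLLM builds tokenizer_info from its own config.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)

    # As in the hunk above: convert the MB knob to bytes to bound the cache.
    cache_size = 512 * 1024 * 1024  # the VLLM_XGRAMMAR_CACHE_MB default
    compiler = xgr.GrammarCompiler(
        tokenizer_info,
        max_threads=8,
        cache_enabled=True,
        cache_limit_bytes=cache_size,
    )
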
vllm/v1/structured_output/backend_xgrammar.py

Lines changed: 7 additions & 1 deletion
@@ -5,6 +5,7 @@
 
 import torch
 
+import vllm.envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
@@ -76,7 +77,12 @@ def __init__(self, vllm_config: VllmConfig):
             tokenizer,
             vocab_size=self.vocab_size,
         )
-        self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
+        self.compiler = xgr.GrammarCompiler(
+            tokenizer_info,
+            max_threads=8,
+            cache_enabled=True,
+            cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
+        )
 
     def compile_grammar(self, request_type: StructuredOutputOptions,
                         grammar_spec: str) -> StructuredOutputGrammar:
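
The effect of the bounded cache is that repeated compilations of the same grammar are served from memory rather than recompiled, with the cache's total size capped at cache_limit_bytes. A hedged sketch using xgrammar's compile_json_schema; the schema and the cap are illustrative:

    import xgrammar as xgr
    from transformers import AutoTokenizer

    info = xgr.TokenizerInfo.from_huggingface(AutoTokenizer.from_pretrained("gpt2"))
    compiler = xgr.GrammarCompiler(
        info,
        max_threads=8,
        cache_enabled=True,
        cache_limit_bytes=64 * 1024 * 1024,  # small illustrative cap
    )

    schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'
    cold = compiler.compile_json_schema(schema)  # full compile; result is cached
    warm = compiler.compile_json_schema(schema)  # same schema; served from the cache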
