
Commit 0ef9659

[Bugfix] Change max_size for LRU (#1349)
## 📌 Description

LRU cache eviction breaks frameworks during CUDA graph replay: if a cached cudnn graph is evicted while a captured CUDA graph still references it, replay crashes with a segmentation fault. This change lifts the size bound on the cache.

```
Fatal Python error: Segmentation fault

Thread 0x00007f62e7ffe6c0 (most recent call first):
  File "/opt/mycode/python/sglang/srt/managers/scheduler.py", line 2215 in watchdog_thread
  File "/usr/lib/python3.12/threading.py", line 1010 in run
  File "/usr/lib/python3.12/threading.py", line 1073 in _bootstrap_inner
  File "/usr/lib/python3.12/threading.py", line 1030 in _bootstrap

Current thread 0x00007f62ebfff6c0 (most recent call first):
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/graphs.py", line 88 in replay
  File "/opt/mycode/python/sglang/srt/model_executor/cuda_graph_runner.py", line 752 in replay
  File "/opt/mycode/python/sglang/srt/model_executor/model_runner.py", line 1633 in _forward_raw
  File "/opt/mycode/python/sglang/srt/model_executor/model_runner.py", line 1606 in forward
  File "/opt/mycode/python/sglang/srt/managers/tp_worker.py", line 234 in forward_batch_generation
  File "/opt/mycode/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 175[2025-07-29 07:12:06] INFO: 127.0.0.1:49294 - "POST /generate HTTP/1.1" 200 OK in forward_thread_func_
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116 in decorate_context
  File "/opt/mycode/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 140 in forward_thread_func
  File "/usr/lib/python3.12/threading.py", line 1010 in run
  File "/usr/lib/python3.12/threading.py", line 1073 in _bootstrap_inner
  File "/usr/lib/python3.12/threading.py", line 1030 in _bootstrap

Thread 0x00007f72fffe46c0 (most recent call first):
  File "/usr/lib/python3.12/threading.py", line 359 in wait
  File Fatal Python error: "Segmentation fault/usr/lib/python3.12/threading.py "Fatal Python error: Thread 0x, line Segmentation fault00007f8375ffd6c0655
```

(The tail of the trace is interleaved output from several threads crashing concurrently.)

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
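For context on the change itself: `functools.cache` (Python 3.9+) is equivalent to `functools.lru_cache(maxsize=None)`, so entries are never evicted. A minimal sketch of the behavioral difference, using stand-in builders rather than the real FlashInfer functions:

```python
import functools

# Bounded cache (old behavior): building a 129th distinct graph evicts the
# least-recently-used entry. If a captured CUDA graph still references
# resources owned by the evicted object, replaying it touches freed memory.
@functools.lru_cache(maxsize=128)
def build_graph_bounded(key):
    return object()  # stand-in for a cudnn execution graph

# Unbounded cache (new behavior): equivalent to lru_cache(maxsize=None).
# Cached graphs live for the lifetime of the process and are never evicted.
@functools.cache
def build_graph_unbounded(key):
    return object()

for i in range(200):
    build_graph_bounded(i)
    build_graph_unbounded(i)

print(build_graph_bounded.cache_info())    # maxsize=128,  currsize=128
print(build_graph_unbounded.cache_info())  # maxsize=None, currsize=200
```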
1 parent e9f43f0 commit 0ef9659

File tree

1 file changed: +2 −2 lines changed

flashinfer/gemm.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -834,7 +834,7 @@ def _validate_fp8_output_dtype(dtype: torch.dtype):
     )


-@functools.lru_cache(maxsize=128)
+@functools.cache
 def build_cudnn_gemm_block_scale_dequantize_graph(
     a_shape,
     a_stride,
@@ -946,7 +946,7 @@ def execute_cudnn_gemm_fp4_graph(graph, a, b, a_descale, b_descale, alpha, c_fin
     graph.execute(variant_pack, workspace, handle=_get_cudnn_handle(stream))


-@functools.lru_cache(maxsize=128)
+@functools.cache
 def build_cudnn_gemm_with_per_tensor_q_graph(
     a_shape, a_stride, b_shape, b_stride, a_type, b_type, o_type, device
 ):
```
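To see why the bound is hazardous here: evicting an entry drops the cache's strong reference, and in CPython the graph object is then destroyed immediately if nothing else holds it, while a previously captured CUDA graph may still point at its resources. A small illustration with a hypothetical `Graph` stand-in (not FlashInfer code), assuming CPython's reference-counting collector:

```python
import functools
import weakref


class Graph:
    """Hypothetical stand-in for a cudnn execution graph."""
    def __init__(self, key):
        self.key = key


@functools.lru_cache(maxsize=1)  # tiny bound to force eviction quickly
def build(key):
    return Graph(key)


ref = weakref.ref(build("shape-A"))  # weak reference to the cached graph
assert ref() is not None             # alive while it sits in the cache

build("shape-B")                     # second key evicts "shape-A"
assert ref() is None                 # evicted graph has been destroyed

# With @functools.cache there is no eviction, so ref() would stay
# alive for the lifetime of the process.
```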
