 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
-                                        write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                ProfileExecuteDuration, is_310p,
                                maybe_converting_weight_acl_format,
@@ -2323,67 +2321,27 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
 
         return kv_cache_spec
 
-    def _compile_torchair_graph(self, torchair_graph_batch_sizes) -> None:
-        # Trigger torchair graph capture for specific shapes.
+    def _capture_model(self):
+        if not self.use_aclgraph:
+            logger.info("Skipping NPU graph capture for eager mode.")
+            return
+        # Trigger ACL graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
-        for idx, num_tokens in enumerate(reversed(torchair_graph_batch_sizes)):
-            for _ in range(self.vllm_config.compilation_config.
-                           cudagraph_num_of_warmups):
-                self._dummy_run(num_tokens, is_torchair_compile=True)
-            self._dummy_run(num_tokens, is_torchair_compile=True)
-            logger.info("Batchsize %d is compiled successfully: %d/%d.",
-                        num_tokens, idx + 1, len(torchair_graph_batch_sizes))
+        # TODO(zzzzwwjj): Check dummy_run with ACL Graph and full graph mode
+        with graph_capture(device=self.device):
+            for num_tokens in reversed(self.aclgraph_batch_sizes):
+                for _ in range(self.vllm_config.compilation_config.
+                               cudagraph_num_of_warmups):
+                    self._dummy_run(num_tokens)
+                self._dummy_run(num_tokens)
 
     def capture_model(self) -> None:
         start_time = time.perf_counter()
         start_free_npu_memory = torch.npu.mem_get_info()[0]
-        # TODO(NeverRaR): Calling graph_capture(device=self.device) in
-        # torchair graph capture can cause some issues, so now we just
-        # temporarily split the codepath for the two different graph patterns.
-        if self.torchair_graph_enabled:
-            torchair_graph_batch_sizes = self.torchair_graph_batch_sizes
-            graph_num = len(torchair_graph_batch_sizes)
-
-            if self.use_cached_npu_graph and not check_torchair_cache_exist():
-                # If caching is enabled but does not exist, we will compile the model twice. The first
-                # time is used to generate the cache, and the second time is used to load the cache to
-                # skip the overhead caused by Dynamo guard mechanism.
-                logger.info(
-                    "Use cached npu graph but cache doesn't exist! Now we compile graph to genetate torchair cache, this usually takes %.1f~%.1f mins.",
-                    0.5 * graph_num, 1.5 * graph_num)
-                self._compile_torchair_graph(torchair_graph_batch_sizes)
-                NPUPlatform.synchronize()
-                torch._dynamo.reset()
-                self.torchair_compiled_models.clear()
-            if self.use_cached_npu_graph:
-                logger.info(
-                    "Loading torchair graph cache, this usually takes %.1f~%.1f mins.",
-                    0.3 * graph_num, 0.5 * graph_num)
-                self._compile_torchair_graph(torchair_graph_batch_sizes)
-            else:
-                logger.info(
-                    "Capturing torchair graph, this usually takes %.1f~%.1f mins.",
-                    0.5 * graph_num, 1.5 * graph_num)
-                self._compile_torchair_graph(torchair_graph_batch_sizes)
-
-            if self.new_kv_cache_bytes > 0:
-                write_kv_cache_bytes_to_file(torch.distributed.get_rank(),
-                                             self.new_kv_cache_bytes)
-        elif self.use_aclgraph:
-            # Trigger ACL graph capture for specific shapes.
-            # Capture the large shapes first so that the smaller shapes
-            # can reuse the memory pool allocated for the large shapes.
-            # TODO(zzzzwwjj): Check dummy_run with ACL Graph and full graph mode
-            with graph_capture(device=self.device):
-                for num_tokens in reversed(self.aclgraph_batch_sizes):
-                    for _ in range(self.vllm_config.compilation_config.
-                                   cudagraph_num_of_warmups):
-                        self._dummy_run(num_tokens)
-                    self._dummy_run(num_tokens)
-        else:
-            logger.info("Skipping NPU graph capture for eager mode.")
-            return
+
+        self._capture_model()
+
         end_time = time.perf_counter()
         end_free_npu_memory = torch.npu.mem_get_info()[0]
         elapsed_time = end_time - start_time
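For readers less familiar with the shape-specialized capture pattern in the new `_capture_model`, here is a minimal, self-contained sketch of the same control flow. `fake_graph_capture`, `_dummy_run`, `CUDAGRAPH_NUM_OF_WARMUPS`, and `ACLGRAPH_BATCH_SIZES` are hypothetical stand-ins (not vllm or vllm-ascend APIs); only the descending-batch-size loop and the per-shape warmup runs mirror the diff.

```python
from contextlib import contextmanager

# Hypothetical stand-ins; the real runner uses vllm's graph_capture()
# and compilation_config.cudagraph_num_of_warmups.
CUDAGRAPH_NUM_OF_WARMUPS = 1          # warmup replays before each capture
ACLGRAPH_BATCH_SIZES = [1, 2, 4, 8]   # ascending; capture walks it in reverse


@contextmanager
def fake_graph_capture(device: str):
    """Placeholder for the graph-capture context manager."""
    print(f"begin capture context on {device}")
    yield
    print("end capture context")


def _dummy_run(num_tokens: int) -> None:
    """Placeholder forward pass; the real code replays the model."""
    print(f"  dummy run with {num_tokens} tokens")


def capture_model(device: str = "npu:0") -> None:
    # Capture the large shapes first so that the smaller shapes
    # can reuse the memory pool allocated for the large shapes.
    with fake_graph_capture(device=device):
        for num_tokens in reversed(ACLGRAPH_BATCH_SIZES):
            for _ in range(CUDAGRAPH_NUM_OF_WARMUPS):
                _dummy_run(num_tokens)   # warmup only
            _dummy_run(num_tokens)       # this run is recorded into the graph


if __name__ == "__main__":
    capture_model()
```

Walking the batch sizes from largest to smallest means the biggest allocation is made first, so later, smaller captures can fit inside the memory pool it already reserved rather than growing it again.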
|
|