5 files changed (+14, -7 lines)
@@ -16,4 +16,4 @@
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd  # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
@@ -17,7 +17,7 @@
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
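pre-commit treats `exclude` as a Python regular expression matched against each candidate file path, so the two new alternatives keep codespell away from everything under csrc/rocm/ and csrc/gradlib/. A quick way to sanity-check the extended pattern; this uses `re.search` as an approximation of pre-commit's matching, and the example paths are made up:

```python
import re

# The extended codespell exclude pattern from the hunk above.
exclude = re.compile(
    r'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
    r'|csrc/rocm/.*|csrc/gradlib/.*')

# Hypothetical repository paths, purely for illustration.
for path in [
        "csrc/rocm/attention.cu",
        "csrc/gradlib/gemm.cu",
        "vllm/attention/backends/rocm_flash_attn.py",
]:
    status = "skipped" if exclude.search(path) else "checked"
    print(f"{path}: {status}")
```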
@@ -326,6 +326,11 @@ def measure_current_non_torch():
     # Add some extra non-torch memory 256 MiB (simulate NCCL)
     handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
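For context on why the new assertion can demand exact equality: non-torch memory is whatever the device reports as used minus what the torch allocator holds, so a raw 256 MiB cudaMalloc outside torch moves that value by exactly 256 MiB. A minimal standalone sketch of the same reasoning, assuming a CUDA device and loading libcudart directly via ctypes; the test's own helpers (`lib`, `monitor`, `monitored_values`) are not reproduced here:

```python
import ctypes

import torch


def current_non_torch() -> int:
    """Bytes used on the device that the torch allocator does not own."""
    free, total = torch.cuda.mem_get_info()
    return (total - free) - torch.cuda.memory_reserved()


torch.cuda.init()
libcudart = ctypes.CDLL("libcudart.so")  # assumption: CUDA runtime is on the loader path
libcudart.cudaMalloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]
libcudart.cudaFree.argtypes = [ctypes.c_void_p]

before = current_non_torch()

# Allocate 256 MiB outside of torch, as the test does to simulate NCCL.
ptr = ctypes.c_void_p()
assert libcudart.cudaMalloc(ctypes.byref(ptr), 256 * 1024 * 1024) == 0

# The increase is analytic: exactly the 256 MiB we just allocated.
assert current_non_torch() - before == 256 * 1024 * 1024

libcudart.cudaFree(ptr)
```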
@@ -681,10 +681,12 @@ def forward(
                         seq_lens,
                         make_attn_mask=False)  # type: ignore
                 full_scales = (
-                    1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                    1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                    1.0 / layer._q_scale.item(),
+                    1.0 / layer._k_scale.item(), 1.0 /
+                    layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                     fp8_out_scale.item()) if (
-                        fp8_out_scale and layer._q_scale and layer._prob_scale
+                        fp8_out_scale and layer._q_scale
+                        and layer._prob_scale
                         and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                 out, _ = self.attn_func(
                     query,
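The reformatting above only moves line breaks; the value is unchanged: a tuple of reciprocal quantization scales plus the output scale when ROCm FP8 flash attention is in use, and None otherwise. A self-contained sketch of that gating logic, using a stand-in layer object rather than vLLM's attention layer (attribute names mirror the diff, explicit None checks stand in for the original's truthiness tests, and all values are made up):

```python
import torch


class FakeLayer:
    """Stand-in for the attention layer in the diff; scales are illustrative."""

    def __init__(self):
        self._q_scale = torch.tensor(0.5)
        self._k_scale = torch.tensor(0.25)
        self._v_scale = torch.tensor(0.125)
        self._prob_scale = torch.tensor(1.0)


def build_full_scales(layer, fp8_out_scale, use_rocm_fp8_flash_attn):
    # Mirror of the conditional above: reciprocals of the per-tensor input
    # scales plus the output scale, or None when FP8 attention is not enabled.
    if not (fp8_out_scale is not None and layer._q_scale is not None
            and layer._prob_scale is not None and use_rocm_fp8_flash_attn):
        return None
    return (1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
            1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
            fp8_out_scale.item())


print(build_full_scales(FakeLayer(), torch.tensor(2.0), True))   # (2.0, 4.0, 8.0, 1.0, 2.0)
print(build_full_scales(FakeLayer(), None, True))                # None
```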
@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""