diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index df0bae0c9cbf..7fdb4ec69d31 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -10,7 +10,7 @@ export PYTHONPATH=".." echo "--- Confirming Clean Initial State" while true; do sleep 3 - if grep -q clean /opt/amdgpu/etc/gpu_state; then + if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then echo "GPUs state is \"clean\"" break fi @@ -49,18 +49,18 @@ cleanup_docker echo "--- Resetting GPUs" -echo "reset" > /opt/amdgpu/etc/gpu_state +echo "reset" > ${BUILDKITE_AGENT_META_DATA_RESET_TARGET} while true; do sleep 3 - if grep -q clean /opt/amdgpu/etc/gpu_state; then + if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then echo "GPUs state is \"clean\"" break fi done echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 new file mode 100644 index 000000000000..0d8b6d0a4f93 --- /dev/null +++ b/.buildkite/test-template.j2 @@ -0,0 +1,47 @@ +{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %} +{% set default_working_dir = "vllm/tests" %} +{% set hf_home = "/root/.cache/huggingface" %} + +steps: + - label: ":docker: build image" + depends_on: ~ + commands: + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ." 
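+      # Push the freshly built image so the AMD test steps below (and
+      # run-amd-test.sh) can pull it from the private ROCm CI registry by commit SHA.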
+ - "docker push {{ docker_image_amd }}" + key: "amd-build" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + agents: + queue: amd-cpu + soft_fail: false + +{% for step in steps %} +{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} + - label: "AMD: {{ step.label }}" + depends_on: + - "amd-build" + agents: +{% if step.amd_gpus and step.amd_gpus==8%} + queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu +{% else%} + queue: amd_gpu +{% endif%} + commands: + - bash .buildkite/scripts/hardware_ci/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + priority: 100 + soft_fail: false +{% endif %} +{% endfor %} diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 1ee605dc7bb0..000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: PR Reminder Comment Bot -permissions: - pull-requests: write -on: - pull_request_target: - types: [opened] -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - script: | - try { - // Get the PR author - const prAuthor = context.payload.pull_request.user.login; - - // Check if this is the author's first PR in this repository - // Use GitHub's search API to find all PRs by this author - const { data: searchResults } = await github.rest.search.issuesAndPullRequests({ - q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`, - per_page: 100 - }); - - const authorPRCount = searchResults.total_count; - - console.log(`Found ${authorPRCount} PRs by ${prAuthor}`); - - // Only post comment if this is the first PR (only one PR by this author) - if (authorPRCount === 1) { - console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`); - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + - '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + - 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' + - 'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. 
\n\n' +
-            'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-            'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-            'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
-            '🚀'
-              });
-            } else {
-              console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
-            }
-          } catch (error) {
-            console.error('Error checking PR history or posting comment:', error);
-            // Don't fail the workflow, just log the error
-          }
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index c69ebbb42da5..2bb7b726194d 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -1,22 +1,20 @@
 #!/bin/bash
 set -eux
 
-python_executable=python$1
-cuda_home=/usr/local/cuda-$2
+python_executable=python3
 
 # Update paths
-PATH=${cuda_home}/bin:$PATH
-LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
-
 # Install requirements
-$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+$python_executable -m pip install -r requirements/rocm.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
 
 # Make sure release wheels are built for the following architectures
-export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+
+rm -f "$(which sccache)"
 
-bash tools/check_repo.sh
+export MAX_JOBS=32
 
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/ROCm_performance.md b/ROCm_performance.md
new file mode 100644
index 000000000000..f6c67637a968
--- /dev/null
+++ b/ROCm_performance.md
@@ -0,0 +1,19 @@
+# Overview of the optional performance features unique to https://github.com/ROCm/vllm
+
+## Triton attention
+The default attention backend on ROCm uses a Triton attention kernel. To fall back to the https://github.com/ROCm/flash-attention implementation, set the following environment variable:
+`VLLM_USE_TRITON_FLASH_ATTN=0`
+
+## Tunable ops
+PyTorch tunable ops are supported.
+Set the environment variable `PYTORCH_TUNABLEOP_ENABLED=1` to enable both the runtime tuning and the subsequent use of tuned results. To only use the tuned results without tuning any newly encountered shapes, set `PYTORCH_TUNABLEOP_TUNING=0`.
+
+## Custom PagedAttention
+
+On ROCm, a custom paged-attention kernel with better performance is available and is controlled by the environment variable `VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1`.
+Currently, this variable is enabled by default. To fall back to the PagedAttention v2 kernel, set the variable to 0.
+The custom PagedAttention kernel is enabled for dtype bf16 or fp16, block-size=16, head-size=128, and max context length <= 16k, with a GQA ratio (num_heads//num_kv_heads) between 1 and 16. In all other cases, we fall back to the PagedAttention v2 kernel.
+
+## NCCL Performance environment variable
+
+For MI300X, setting the environment variable NCCL_MIN_NCHANNELS=112 is expected to improve performance.
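+## Example
+
+A minimal, illustrative invocation combining the variables above (the model name and serving entrypoint are placeholders; adapt them to your setup):
+
+```bash
+# Keep the custom paged-attention kernel (the default), fall back from the
+# Triton flash-attention kernel to the ROCm/flash-attention implementation,
+# and apply the MI300X NCCL hint.
+VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1 \
+VLLM_USE_TRITON_FLASH_ATTN=0 \
+NCCL_MIN_NCHANNELS=112 \
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf
+```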
diff --git a/benchmarks/P3L.py b/benchmarks/P3L.py new file mode 100755 index 000000000000..793b88a4a61e --- /dev/null +++ b/benchmarks/P3L.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Patch-Perplexity (P3L) + +This is a script that produces a realistic PPL measurement +for the quantized KV cache system by processing a sequence of +non-overlapping patches of the reference text. Generation of the +consecutive symbols in each patch is governed (forced) +by the reference text. + +The initial context size for the system is set by the parameter +"--context-size". + +The number of output symbols to generate starting from a given +context is set by the parameter "--sample-size". This variable also +defines the size of the individual patch. + +For the N-token reference text that is split into M patches with the +system's context size C it takes M*preload + (N-C)*generation time. + +Quick correctness validation tips: + +Running llama-2-7b model +( + ./vllm/examples/P3L.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 +) +should result in PPL ~ 6.524227946419175 + +Running llama-2-7b model +( + ./vllm/examples/P3L.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 + --patch-size=1 +) +should result in PPL ~ PPL=3.8968611189957523 + +Running the script with multiple batches is possible +by specifying the --batch-size parameter. + +""" + +import argparse +import dataclasses +import datetime +import json +import math +import os +import tempfile + +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +def get_wikitext2_text(tokenizer): + with tempfile.TemporaryDirectory() as tmpdirname: + hf_hub_download( + repo_id="alexei-v-ivanov-amd/wiki", + repo_type="dataset", + filename="wiki.test.raw", + local_dir=tmpdirname, + ) + with open(os.path.join(tmpdirname, "wiki.test.raw")) as f: + test_text = "\n".join(line.strip() for line in f) + test_enc = tokenizer(test_text) + + return test_enc, test_text + + +def vllm_init(args): + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=1, + temperature=0.0, + top_p=1, + ignore_eos=True, + ppl_measurement=True, + future_context=[], + prompt_logprobs=1, + logprobs=1, + presence_penalty=0.0, + ) + + return llm, sampling_params + + +def vllm_predict(CONT, llm, sampl_par): + result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par) + return result + + +def main(args: argparse.Namespace): + MESSAGE = f"Initialising @ {datetime.datetime.now()}" + logger.info(MESSAGE) + print(MESSAGE) + my_ppl = 0.0 + + logger.info("Initializing the engine.") + my_llm, my_sampl_par = vllm_init(args) + my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer + logger.info(my_sampl_par) + logger.info("Initialized the engine.") + + my_n_samples = args.sample_size + + if ( + args.context_size + my_n_samples + ) > my_llm.llm_engine.model_config.max_model_len: + MESSAGE = ( + "" + "Error! The total number of tokens:\n" + f" prefix ({args.context_size}) + " + f"to be generated ({my_n_samples})" + f" can't be bigger than the model limit " + f"({my_llm.llm_engine.model_config.max_model_len})." 
+ ) + logger.info(MESSAGE) + print(MESSAGE) + return + + my_test_enc, my_test_text = get_wikitext2_text(my_tokenizer) + logger.info("Loaded the test data.") + + my_n_patches = math.ceil( + (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples + ) + if args.patch_size is not None: + my_n_patches = args.patch_size + + num_tokens_generated = 0 + starting_time = datetime.datetime.now() + MESSAGE = ( + f"Starting generation @ {starting_time}\n" + " Have the test sample of " + f"{len(my_test_enc['input_ids'])} tokens" + f" will try to process {my_n_patches} patche(s)," + f" generating {my_n_samples} tokens in each patch" + f" from the initial context of {args.context_size} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + my_batchsize = args.batch_size + + for c in range(0, my_n_patches, my_batchsize): + CONTEXT = [] + my_sampl_par.future_context = [] + my_sampl_par.cntr = [] + + for b in range(my_batchsize): + if (c + b) < my_n_patches: + upper_boundary = min( + (c + b + 1) * my_n_samples + args.context_size, + len(my_test_enc["input_ids"]), + ) + CONTEXT.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples : (c + b) * my_n_samples + + args.context_size + ] + ) + + my_sampl_par.future_context.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples + args.context_size : upper_boundary + ] + ) + + my_sampl_par.cntr.append(c + b) + + my_sampl_par.max_tokens = max( + len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)) + ) + + LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par) + for b in range(len(CONTEXT)): + num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids) + my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob + + if num_tokens_generated < my_n_samples * len(CONTEXT): + MESSAGE = ( + f"Warning: The number of generated tokens is" + f"less than requested ({num_tokens_generated}" + f" < {my_n_samples * len(CONTEXT)})." + ) + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"Iterations {c + 1} through {c + len(CONTEXT)}" + f" of {my_n_patches} Intermediate " + "Estimates:\n" + f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n" + f"\tPerplexity_intermediate=" + f"{math.exp(my_ppl / num_tokens_generated)}" + ) + + logger.info(MESSAGE) + print(MESSAGE) + + ending_time = datetime.datetime.now() + MESSAGE = ( + f"Done @ {ending_time} after processing for" + f" {ending_time - starting_time}" + f" generated {num_tokens_generated} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" + f"{my_ppl / num_tokens_generated}" + f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}" + ) + + if args.output_json: + results = { + "integral_cross_entropy": my_ppl, + "average_cross_entropy": my_ppl / num_tokens_generated, + "ppl": math.exp(my_ppl / num_tokens_generated), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + logger.info(MESSAGE) + print(MESSAGE) + return + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Measure the PPPL (P3L) score of a given model." 
+ ) + parser.add_argument("--context-size", type=int, default=4096) + parser.add_argument("--sample-size", type=int, default=512) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--patch-size", type=int, default=None) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/P3L_mling.py b/benchmarks/P3L_mling.py new file mode 100755 index 000000000000..7055745e601e --- /dev/null +++ b/benchmarks/P3L_mling.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +*MULTILINGUAL* Patch-Perplexity (P3L) + +This is a script that produces a realistic PPL measurement +for the quantized KV cache system by processing a sequence of +non-overlapping patches of the reference text. Generation of the +consecutive symbols in each patch is governed (forced) +by the reference text. + +The initial context size for the system is set by the parameter +"--context-size". + +The number of output symbols to generate starting from a given +context is set by the parameter "--sample-size". This variable also +defines the size of the individual patch. + +For the N-token reference text that is split into M patches with the +system's context size C it takes M*preload + (N-C)*generation time. + +Quick correctness validation tips: + +Running DeepSeek-V2 model +( + ./vllm/examples/P3L_mling.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 +) + +should result in PPL ~ 8.42927 + +Running DeepSeek-V2 model +( + ./vllm/examples/P3L_mling.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 + --patch-size=1 + --lang-script="cmn_Hant" +) +should result in PPL ~ 2.67962 + +The multi-linguality is implemented through the additional +key "--lang-script", which defaults to English in Latin +scripture ("eng_Latn"). + +Please refer to + +https://confluence.amd.com/display/MLSE/Multi-Lingual+P3L+Test + +for the complete set of possible language-scripture choices. + +Running the script with multiple batches is possible +by specifying the --batch-size parameter. 
+ +""" + +import argparse +import dataclasses +import datetime +import json +import math +import os +import tempfile + +import pandas +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +def get_wikitext2_text(tokenizer): + with tempfile.TemporaryDirectory() as tmpdirname: + hf_hub_download( + repo_id="alexei-v-ivanov-amd/wiki", + repo_type="dataset", + filename="wiki.test.raw", + local_dir=tmpdirname, + ) + with open(os.path.join(tmpdirname, "wiki.test.raw")) as f: + test_text = "\n".join(line.strip() for line in f) + test_enc = tokenizer(test_text) + + return test_enc, test_text + + +def get_flores_plus_text(tokenizer, lng_script): + hf_hub_download( + repo_id="alexei-v-ivanov-amd/flores_plus", + repo_type="dataset", + filename=lng_script + ".parquet", + local_dir="./", + ) + + df = pandas.read_parquet("./" + lng_script + ".parquet") + test_text = "\n\n".join(line.strip() for line in df["text"]) + test_enc = tokenizer(test_text) + + os.remove("./" + lng_script + ".parquet") + + return test_enc, test_text + + +def vllm_init(args): + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=1, + temperature=0.0, + top_p=1, + ignore_eos=True, + ppl_measurement=True, + future_context=[], + prompt_logprobs=1, + logprobs=1, + presence_penalty=0.0, + ) + + return llm, sampling_params + + +def vllm_predict(CONT, llm, sampl_par): + result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par) + return result + + +def main(args: argparse.Namespace): + MESSAGE = f"Initialising @ {datetime.datetime.now()}" + logger.info(MESSAGE) + print(MESSAGE) + my_ppl = 0.0 + + logger.info("Initializing the engine.") + my_llm, my_sampl_par = vllm_init(args) + my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer + logger.info(my_sampl_par) + logger.info("Initialized the engine.") + + my_n_samples = args.sample_size + my_lang_script = args.lang_script + + if ( + args.context_size + my_n_samples + ) > my_llm.llm_engine.model_config.max_model_len: + MESSAGE = ( + "" + "Error! The total number of tokens:\n" + f" prefix ({args.context_size}) + " + f"to be generated ({my_n_samples})" + f" can't be bigger than the model limit " + f"({my_llm.llm_engine.model_config.max_model_len})." + ) + logger.info(MESSAGE) + print(MESSAGE) + return + + my_test_enc, my_test_text = get_flores_plus_text(my_tokenizer, my_lang_script) + + logger.info("Loaded the test data.") + + my_n_patches = math.ceil( + (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples + ) + if args.patch_size is not None: + my_n_patches = args.patch_size + + num_tokens_generated = 0 + starting_time = datetime.datetime.now() + MESSAGE = ( + f"Starting generation @ {starting_time}\n" + " Have the test sample of " + f"{len(my_test_enc['input_ids'])} tokens" + f" will try to process {my_n_patches} patche(s)," + f" generating {my_n_samples} tokens in each patch" + f" from the initial context of {args.context_size} tokens." 
+ ) + + logger.info(MESSAGE) + print(MESSAGE) + + my_batchsize = args.batch_size + + for c in range(0, my_n_patches, my_batchsize): + CONTEXT = [] + my_sampl_par.future_context = [] + my_sampl_par.cntr = [] + + for b in range(my_batchsize): + if (c + b) < my_n_patches: + upper_boundary = min( + (c + b + 1) * my_n_samples + args.context_size, + len(my_test_enc["input_ids"]), + ) + CONTEXT.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples : (c + b) * my_n_samples + + args.context_size + ] + ) + + my_sampl_par.future_context.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples + args.context_size : upper_boundary + ] + ) + + my_sampl_par.cntr.append(c + b) + + my_sampl_par.max_tokens = max( + len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)) + ) + + LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par) + for b in range(len(CONTEXT)): + num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids) + my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob + + if num_tokens_generated < my_n_samples * len(CONTEXT): + MESSAGE = ( + f"Warning: The number of generated tokens is" + f"less than requested ({num_tokens_generated}" + f" < {my_n_samples * len(CONTEXT)})." + ) + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"Iterations {c + 1} through {c + len(CONTEXT)}" + f" of {my_n_patches} Intermediate " + "Estimates:\n" + f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n" + f"\tPerplexity_intermediate=" + f"{math.exp(my_ppl / num_tokens_generated)}" + ) + + logger.info(MESSAGE) + print(MESSAGE) + + ending_time = datetime.datetime.now() + MESSAGE = ( + f"Done @ {ending_time} after processing for" + f" {ending_time - starting_time}" + f" generated {num_tokens_generated} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" + f"{my_ppl / num_tokens_generated}" + f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}" + ) + + if args.output_json: + results = { + "integral_cross_entropy": my_ppl, + "average_cross_entropy": my_ppl / num_tokens_generated, + "ppl": math.exp(my_ppl / num_tokens_generated), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + logger.info(MESSAGE) + print(MESSAGE) + return + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Measure the PPPL (P3L) score of a given model." 
+ ) + parser.add_argument( + "--data", + type=str, + default="./wikitext/wikitext-2-v1/test-00000-of-00001.parquet", + ) + parser.add_argument("--context-size", type=int, default=4096) + parser.add_argument("--sample-size", type=int, default=512) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--patch-size", type=int, default=None) + parser.add_argument("--lang-script", type=str, default="eng_Latn") + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/moe_tune_script.sh b/benchmarks/kernels/moe_tune_script.sh new file mode 100755 index 000000000000..acd2502e0587 --- /dev/null +++ b/benchmarks/kernels/moe_tune_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + +## ---- Mixtral fp8 tuning example ---- ## +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 1 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 2 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 4 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 8 --tune --dtype fp8_w8a8 + + +## ---- Mixtral fp16 tuning example ---- ## +# we don't need --dtype fp16; it has been set as default for rocm in the script. + +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 1 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 2 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 4 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 8 --tune + + + +## ---- After the tuning is finished ---- ## +# The tuning script saves the configurations in a json file at the same directory from where you launch the script. +# The name of the json file will look something like this: E=8,N=14336,device_name=AMD_Instinct_MI300X.json +# +# [IMPORTANT] -> Once the tuning is complete, move the tuned config file(s) to the following path: +# vllm/vllm/model_executor/layers/fused_moe/configs/ + + +## ---- Notes ---- ## +# 1. The tuned file is specific for a TP size. This means a tuned file obtained for --tp-size 8 can only be used when running the model under TP=8 setting. +# 2. The script uses Ray for multi-gpu tuning. Export HIP_VISIBLE_DEVICES accordingly to expose the required no. of GPUs and use multiple gpus for tuning. +# 3. RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 resolves the following errors (depending on if HIP_VISIBLE_DEVICES is set or not): +# - Error-1: RuntimeError: HIP error: invalid device ordinal +# HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+# For debugging consider passing AMD_SERIALIZE_KERNEL=3 +# - Error-2: RuntimeError: HIP_VISIBLE_DEVICES contains more devices than ROCR_VISIBLE_DEVICES + diff --git a/benchmarks/profiling/README.md b/benchmarks/profiling/README.md new file mode 100644 index 000000000000..ee65e8025cc5 --- /dev/null +++ b/benchmarks/profiling/README.md @@ -0,0 +1,57 @@ +# VLLM Benchmark Profiling + +This profiling directory provides a method to profile VLLM throughput and latency benchmarks using ROCm profiling utilities. + +## 1. Dependencies + +Before using the profiling feature, you need to install the required dependencies: + +### Install ROCm Profile Data + +```bash +git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git +cd rocmProfileData && make && sudo make install +``` + +### Install hipMarker + +```bash +cd rocmProfileData/hipMarker && python3 setup.py install +``` + +## 2. Profiling Benchmarks + +Profiling can be used to monitor the performance of the VLLM benchmarks with ROCm. The key flags used for profiling are: + +- `--profile-rpd`: Profiles the generation process of a single batch. +- `--profile-dir PROFILE_DIR`: Specifies the path to save the profiler output, which can later be visualized using tools like [ui.perfetto.dev](https://ui.perfetto.dev/) or [chrome.tracing](chrome://tracing/). + +### Profiling Using Default Directory + +By default, profiling results are saved in either `vllm_benchmark_latency_result` or `vllm_benchmark_throughput_result`. To run a benchmark and profile it using the default directory, execute: + +```bash +python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd +``` + +### Profiling With a Custom Directory + +You can specify a custom directory for saving profiler outputs by using the `--profile-dir` flag: + +```bash +python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd --profile-dir {/path/to/custom/dir} +``` + +After profiling is complete, an `.rpd` file containing the trace data will be saved to the specified directory. + +## 3. Convert Trace Data to JSON Format + +To view the trace data, it needs to be converted into a format that is compatible with tools like Chrome tracing or Perfetto. + +You can use the `rpd2tracing.py` script in rocmProfileData to convert the `.rpd` file into a JSON file: + +```bash +python3 rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json +``` + +Once the trace is converted, open the `.json` file in [Chrome](chrome://tracing/) or [Perfetto](https://ui.perfetto.dev/) for visualization. 
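+## 4. End-to-End Example
+
+Putting the steps above together (model, lengths, and paths are illustrative):
+
+```bash
+# Run the throughput benchmark with rpd profiling enabled, saving the trace
+# under ./profiles, then convert it for Chrome tracing / Perfetto.
+python3 benchmark_throughput.py --input-len 512 --output-len 128 --model {model} \
+    --profile-rpd --profile-dir ./profiles
+python3 rocmProfileData/tools/rpd2tracing.py ./profiles/*.rpd trace.json
+```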
diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py new file mode 100644 index 000000000000..6ebfa911884c --- /dev/null +++ b/benchmarks/profiling/benchmark_latency.py @@ -0,0 +1,203 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from contextlib import contextmanager, nullcontext +from pathlib import Path +from typing import Optional + +import numpy as np +import torch +from tqdm import tqdm + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser + + +def main(args: argparse.Namespace): + print(args) + + @contextmanager + def rpd_profiler_context(): + from rpdTracerControl import rpdTracerControl as rpd + + llm.start_profile() + yield + llm.stop_profile() + rpd.top_totals() + + @contextmanager + def torch_profiler_context(profile_result_dir: Optional[str] = None): + p = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_result_dir) + ), + ) + p.start() + try: + with torch.no_grad(): + yield p + finally: + p.stop() + print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + + def get_profiling_context(profile_result_dir: Optional[str] = None): + if args.profile_torch: + return torch_profiler_context(profile_result_dir) + elif args.profile_rpd: + return rpd_profiler_context() + else: + return nullcontext() + + if args.profile_torch or args.profile_rpd: + profile_result_dir = Path( + args.profile_result_dir or "./vllm_benchmark_latency_result" + ) + profile_result_dir.mkdir(parents=True, exist_ok=True) + name = os.path.basename(os.path.normpath(args.model)) + model_trace_name = ( + f"{name}_in_{args.input_len}_out_{args.output_len}_" + f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}" + ) + print(f"Profiling (results will be saved to '{profile_result_dir}')...") + if args.profile_rpd: + profile_result_dir /= f"{model_trace_name}.rpd" + os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir) + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
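+    # When --profile-rpd is used, VLLM_RPD_PROFILER_DIR is exported above,
+    # before the engine is built, so the rpd tracer driven by
+    # llm.start_profile()/llm.stop_profile() can (presumably) pick up the
+    # trace destination when it initializes.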
+ llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with get_profiling_context(profile_dir): + llm_generate() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile_torch or args.profile_rpd: + run_to_completion(profile_dir=profile_result_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) + parser.add_argument( + "--profile-torch", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-rpd", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-result-dir", + type=str, + default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None), + help=( + "path to save the profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." 
+ ), + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py new file mode 100644 index 000000000000..0f8100e95f2b --- /dev/null +++ b/benchmarks/profiling/benchmark_throughput.py @@ -0,0 +1,637 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline inference throughput.""" + +import argparse +import dataclasses +import json +import os +import random +import time +from contextlib import contextmanager, nullcontext +from functools import cache +from pathlib import Path +from typing import Optional + +import torch +import uvloop +from PIL import Image +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args, +) +from vllm.inputs import TextPrompt +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import BeamSearchParams +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + lora_request: Optional LoRARequest specifying the LoRA to use. + """ + + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + lora_request: Optional[LoRARequest] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. 
+ + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def get_random_lora_request( + args: argparse.Namespace, +) -> tuple[LoRARequest, Optional[AnyTokenizer]]: + global lora_tokenizer_cache + lora_id = random.randint(1, args.max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(args.lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + return lora_request, lora_tokenizer_cache[lora_id] + + +def sample_requests( + tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace +) -> list[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: list[SampleRequest] = [] + for data in tqdm(dataset, total=len(filtered_dataset), desc="sampling requests"): + if len(filtered_dataset) == num_requests: + break + + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert("RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Tokenize the prompts and completions. + prompt_token_ids = request_tokenizer(prompt).input_ids + completion_token_ids = request_tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. 
+ continue + filtered_dataset.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data, + lora_request=lora_request, + ) + ) + + return filtered_dataset + + +def run_vllm( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, +) -> float: + from vllm import LLM, SamplingParams + + @contextmanager + def rpd_profiler_context(): + from rpdTracerControl import rpdTracerControl as rpd + + llm.start_profile() + yield + llm.stop_profile() + rpd.top_totals() + + @contextmanager + def torch_profiler_context(profile_dir: Optional[str] = None): + p = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir)), + ) + p.start() + try: + with torch.no_grad(): + yield p + finally: + p.stop() + print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + + def get_profiling_context(profile_dir: Optional[str] = None): + if args.profile_torch: + return torch_profiler_context(profile_dir) + elif args.profile_rpd: + return rpd_profiler_context() + else: + return nullcontext() + + if args.profile_torch or args.profile_rpd: + profile_dir = Path(args.profile_dir or "./vllm_benchmark_throughput_result") + profile_dir.mkdir(parents=True, exist_ok=True) + name = os.path.basename(os.path.normpath(args.model)) + model_trace_name = ( + f"{name}_in_{args.input_len}_out_{args.output_len}_" + f"tp_{args.tensor_parallel_size}" + ) + print(f"Profiling (results will be saved to '{profile_dir}')...") + if args.profile_rpd: + profile_dir /= f"{model_trace_name}.rpd" + os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir) + + llm = LLM(**dataclasses.asdict(engine_args)) + + # Add the requests to the engine. + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + ) + ) + lora_requests: Optional[list[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + if not use_beam_search: + execute = lambda: llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. + output_len = requests[0][2] + for request in requests: + assert request.expected_output_len == output_len + execute = lambda: llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + ), + ) + + if args.profile_torch or args.profile_rpd: + with get_profiling_context(profile_dir): + execute() + return + else: + start = time.perf_counter() + execute() + end = time.perf_counter() + return end - start + + +async def run_vllm_async( + requests: list[SampleRequest], + n: int, + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, +) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing + ) as llm: + # Add the requests to the engine. 
+ prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.lora_requests, + ) + ) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt, prompt_len, output_len = requests[i] + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + _, next_prompt_len, next_output_len = requests[i + 1] + if ( + max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len) + ) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. + input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: list[SampleRequest], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import client, serve + + llm = serve(model, tensor_parallel=tensor_parallel_size) + prompts = [request.prompt for request in requests] + + start = time.perf_counter() + llm.generate(prompts, max_new_tokens=output_len) + end = time.perf_counter() + client = client(model) + client.terminate_server() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. 
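+    # Requests are either sampled from a ShareGPT-style JSON file (--dataset)
+    # or, when no dataset is given, synthesized from random token ids so that
+    # each prompt is roughly --input-len tokens long.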
+ tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + if args.dataset is None: + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = request_tokenizer.decode(candidate_ids) + tokenized_len = len(request_tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend( + [random.randint(100, vocab_size - 100) for _ in range(diff)] + ) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest( + prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len, + lora_request=lora_request, + ) + ) + else: + requests = sample_requests(tokenizer, args) + + is_multi_modal = any(request.multi_modal_data is not None for request in requests) + + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + ) + ) + else: + elapsed_time = run_vllm(requests, args.n, EngineArgs.from_cli_args(args)) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + ) + elif args.backend == "mii": + elapsed_time = run_mii( + requests, args.model, args.tensor_parallel_size, args.output_len + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum( + request.prompt_len + request.expected_output_len for request in requests + ) + total_output_tokens = sum(request.expected_output_len for request in requests) + + if args.profile_torch or args.profile_rpd: + # Profiling complete + pass + else: + if is_multi_modal: + print( + "\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are" + " not counted. See vllm-project/vllm/issues/9778 for details." + ) + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
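+        # Note: these token totals are computed from the requested lengths
+        # (prompt_len + expected_output_len), not from what was actually generated.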
+ print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" + ) + parser.add_argument( + "--dataset", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=1000, help="Number of prompts to process." + ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--profile-torch", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-rpd", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help=( + "path to save the profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." 
+ ), + ) + + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + if args.enable_lora: + assert args.lora_path is not None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + elif args.backend == "hf": + if args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + main(args) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9c0ed1d09572..a85cd13061eb 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -122,7 +122,6 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) "-DENABLE_FP8" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" - "-Werror=unused-variable" "-fno-gpu-rdc") endif() diff --git a/csrc/rocm/fused_kernels.cu b/csrc/rocm/fused_kernels.cu new file mode 100644 index 000000000000..a247f5c15c7b --- /dev/null +++ b/csrc/rocm/fused_kernels.cu @@ -0,0 +1,192 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T silu(const T& x) { + // x * sigmoid(x) + return (T)(((float)x) / (1.0f + expf((float)-x))); +} + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + // auto dat0 = *(addr_alias); + // auto dat1 = *(addr_alias+1); + // auto dat2 = *(addr_alias+2); + // auto dat3 = *(addr_alias+3); + return make_float4(dat0, dat1, dat2, dat3); +} + +// TBlock fetches entire rows of A, and entire col of B (K dimension); assume +// N=1 for time being grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm_Silu_kernel(float4* af4, __half2* bf4, _Float16* c, + const int d) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK / 2 * blockDim.x; + const int row_addr_d = row_addr + d * blockDim.x; + // int row_addr_1 = row_addr + CUDA_NUM_THREADS; + // int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + // int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid / 16; + const int 
qthreadid = threadid % 16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + // float4 colB_elem4; + __half2 colB_elem4x, colB_elem4y, colB_elem4z, colB_elem4w; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + + // rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + // rowA_elem4_1 = af4[row_addr_1 + threadid]; + // rowA_elem4_2 = af4[row_addr_2 + threadid]; + // rowA_elem4_3 = af4[row_addr_3 + threadid]; +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK / 2; i++) { + rowA_elem4[2 * i] = load_ntmprl(&af4[row_addr + i * blockDim.x + threadid]); + rowA_elem4[2 * i + 1] = + load_ntmprl(&af4[row_addr_d + i * blockDim.x + threadid]); + // rowA_elem4[i] = af4[row_addr + i*blockDim.x + threadid]; + //__syncthreads(); + } + colB_elem4x = bf4[threadid * 4 + 0]; + colB_elem4y = bf4[threadid * 4 + 1]; + colB_elem4z = bf4[threadid * 4 + 2]; + colB_elem4w = bf4[threadid * 4 + 3]; + + // __syncthreads(); + __half2 Af2; + float2 S; + // auto Bh2ptr = reinterpret_cast<__half2 *>(&colB_elem4); + // auto Bf2x = *Bh2ptr; + // auto Bf2y = *(Bh2ptr+1); + // auto Bf2z = *(Bh2ptr+2); + // auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2*>(&rowA_elem4); + __half2* ah2lptr; +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + ah2lptr = Ah2ptr + i * 4; + Af2 = *(ah2lptr); + acch2 = __hmul2(Af2, colB_elem4x); + Af2 = *(ah2lptr + 1); + acch2 = __hfma2(Af2, colB_elem4y, acch2); + Af2 = *(ah2lptr + 2); + acch2 = __hfma2(Af2, colB_elem4z, acch2); + Af2 = *(ah2lptr + 3); + acch2 = __hfma2(Af2, colB_elem4w, acch2); + S = __half22float2(acch2); + acc[i] = S.x + S.y; + } + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + acc[i] += __shfl_xor(acc[i], mask); + } + } + + // Warp leaders store the data to shared memory. 
+ // if (lane == 0) { + // #pragma unroll + // for (int i=0; i= 1; mask /= 2) { + // #pragma unroll + // for (int i=0; i +void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K, + cudaStream_t stream, const int rows_per_block = 4) { + float4* af4 = reinterpret_cast(in_a); + auto* bf4 = reinterpret_cast<__half2*>(in_b); + auto* c = reinterpret_cast<_Float16*>(out_c); + const int d = M / 2; + const int NUM_THREADS = K * 2 / 16; + int NUM_BLOCKS = M / rows_per_block; + if (rows_per_block == 2) { + LLGemm_Silu_kernel<2> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 4) { + LLGemm_Silu_kernel<4> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 8) { + LLGemm_Silu_kernel<8> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 16) { + LLGemm_Silu_kernel<16> + <<>>(af4, bf4, c, d); + } else { + NUM_BLOCKS = M / 4; + LLGemm_Silu_kernel<4> + <<>>(af4, bf4, c, d); + } + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} diff --git a/csrc/type_convert.cuh b/csrc/type_convert.cuh index 21b9d0ae515d..d3f72b8b6b0e 100644 --- a/csrc/type_convert.cuh +++ b/csrc/type_convert.cuh @@ -162,4 +162,4 @@ struct alignas(16) _f16Vec { return result; } }; -} // namespace vllm \ No newline at end of file +} // namespace vllm diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 0b98c4b1aeb2..a36604462d45 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,12 +1,15 @@ # default base image ARG REMOTE_VLLM="0" ARG COMMON_WORKDIR=/app -ARG BASE_IMAGE=rocm/vllm-dev:base +ARG BASE_IMAGE=rocm/vllm-private:355_wip_base_image_0823 +ARG AITER_BRANCH="355_wip" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base ARG ARG_PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} +ENV AITER_ROCM_ARCH=gfx942;gfx950 # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ @@ -14,10 +17,26 @@ RUN apt-get update -q -y && apt-get install -q -y \ apt-transport-https ca-certificates wget curl # Remove sccache RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR} +# -------------- +# DLL: TEMP since aiter is volatile. 
When base is locked still build aiter +FROM base AS build_aiter +ARG AITER_BRANCH +ARG AITER_REPO +# RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ +# pip install /install/*.whl +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt +RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install + # ----------------------- # vLLM fetch stages @@ -42,8 +61,10 @@ RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ && python3 setup.py bdist_wheel --dist-dir=dist + FROM scratch AS export_vllm ARG COMMON_WORKDIR +COPY --from=build_aiter /app/install/*.whl / COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 2ba5461dfe55..0317975a54b1 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,25 +1,26 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete -ARG HIPBLASLT_BRANCH="aa0bda7b" -ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" -ARG LEGACY_HIPBLASLT_OPTION= -ARG TRITON_BRANCH="e5be006" -ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="f717b2af" -ARG PYTORCH_VISION_BRANCH="v0.21.0" +ARG BASE_IMAGE=registry-sc-harbor.amd.com/framework/compute-rocm-dkms-component-baas-rel:25_ubuntu22.04_py3.10_pytorch_rocm7.1_internal_testing_0ea0592f +# ARG HIPBLASLT_BRANCH="aa0bda7b" +# ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" +# ARG LEGACY_HIPBLASLT_OPTION= +ARG TRITON_BRANCH="pytorch/rocm7.1_internal_testing" +ARG TRITON_REPO="https://github.com/ROCm/triton.git" +ARG PYTORCH_BRANCH="triton_kernel_wrap_fix" +ARG PYTORCH_VISION_BRANCH="5dc9e7de" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="1a7f4dfa" +ARG FA_BRANCH="3222ea3" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="4822e675" -ARG AITER_REPO="https://github.com/ROCm/aiter.git" +# ARG AITER_BRANCH="6b92d30de680ec10e1e1463b609ed0f228f999f3" +# ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base ENV PATH=/opt/rocm/llvm/bin:$PATH ENV ROCM_PATH=/opt/rocm ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: -ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 +ARG PYTORCH_ROCM_ARCH=gfx942;gfx950 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +# ENV AITER_ROCM_ARCH=gfx942;gfx950 ARG PYTHON_VERSION=3.12 @@ -45,28 +46,28 @@ RUN apt-get update -y \ RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython -FROM base AS build_hipblaslt -ARG HIPBLASLT_BRANCH -ARG HIPBLAS_COMMON_BRANCH -# Set to "--legacy_hipblas_direct" for ROCm<=6.2 -ARG LEGACY_HIPBLASLT_OPTION -RUN git clone https://github.com/ROCm/hipBLAS-common.git -RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y -RUN cd hipBLAS-common \ - && git checkout ${HIPBLAS_COMMON_BRANCH} \ - && mkdir build \ - && cd build \ - && cmake .. 
\ - && make package \ - && dpkg -i ./*.deb -RUN git clone https://github.com/ROCm/hipBLASLt -RUN cd hipBLASLt \ - && git checkout ${HIPBLASLT_BRANCH} \ - && apt-get install -y llvm-dev \ - && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ - && cd build/release \ - && make package -RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install +# FROM base AS build_hipblaslt +# ARG HIPBLASLT_BRANCH +# ARG HIPBLAS_COMMON_BRANCH +# # Set to "--legacy_hipblas_direct" for ROCm<=6.2 +# ARG LEGACY_HIPBLASLT_OPTION +# RUN git clone https://github.com/ROCm/hipBLAS-common.git +# RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y +# RUN cd hipBLAS-common \ +# && git checkout ${HIPBLAS_COMMON_BRANCH} \ +# && mkdir build \ +# && cd build \ +# && cmake .. \ +# && make package \ +# && dpkg -i ./*.deb +# RUN git clone https://github.com/ROCm/hipBLASLt +# RUN cd hipBLASLt \ +# && git checkout ${HIPBLASLT_BRANCH} \ +# && apt-get install -y llvm-dev \ +# && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ +# && cd build/release \ +# && make package +# RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install FROM base AS build_triton ARG TRITON_BRANCH @@ -75,10 +76,13 @@ RUN git clone ${TRITON_REPO} RUN cd triton \ && git checkout ${TRITON_BRANCH} \ && if [ ! -f setup.py ]; then cd python; fi \ - && python3 setup.py bdist_wheel --dist-dir=dist \ - && mkdir -p /app/install && cp dist/*.whl /app/install -RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \ - && python3 -m build --wheel && cp dist/*.whl /app/install; fi + && python3 setup.py bdist_wheel --dist-dir=triton/dist \ + && mkdir -p /app/install && cp triton/dist/triton*.whl /app/install +RUN if [ -d triton/python/triton_kernels ]; then \ + pip install build \ + && cd triton/python/triton_kernels \ + && python3 -m build --wheel --dist-dir=triton/dist \ + && cp triton/dist/triton*.whl /app/install; fi FROM base AS build_amdsmi RUN cd /opt/rocm/share/amd_smi \ @@ -86,71 +90,76 @@ RUN cd /opt/rocm/share/amd_smi \ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install FROM base AS build_pytorch -ARG PYTORCH_BRANCH -ARG PYTORCH_VISION_BRANCH -ARG PYTORCH_REPO -ARG PYTORCH_VISION_REPO +#ARG PYTORCH_BRANCH +#ARG PYTORCH_VISION_BRANCH +#ARG PYTORCH_REPO +#ARG PYTORCH_VISION_REPO ARG FA_BRANCH ARG FA_REPO -RUN git clone ${PYTORCH_REPO} pytorch -RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ - pip install -r requirements.txt && git submodule update --init --recursive \ - && python3 tools/amd_build/build_amd.py \ - && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ - && pip install dist/*.whl -RUN git clone ${PYTORCH_VISION_REPO} vision -RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ - && python3 setup.py bdist_wheel --dist-dir=dist \ - && pip install dist/*.whl +#RUN git clone ${PYTORCH_REPO} pytorch +#RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ +# pip install -r requirements.txt && git submodule update --init --recursive \ +# && python3 tools/amd_build/build_amd.py \ +# && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ +# && pip install dist/*.whl +#RUN git clone ${PYTORCH_VISION_REPO} vision +#RUN cd vision && git checkout 
${PYTORCH_VISION_BRANCH} \ +# && python3 setup.py bdist_wheel --dist-dir=dist \ +# && pip install dist/*.whl RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist -RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ - && cp /app/vision/dist/*.whl /app/install \ +RUN mkdir -p /app/install \ + # && cp /app/pytorch/dist/*.whl /app/install \ + # && cp /app/vision/dist/*.whl /app/install \ && cp /app/flash-attention/dist/*.whl /app/install +RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install -FROM base AS build_aiter -ARG AITER_BRANCH -ARG AITER_REPO -RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ - pip install /install/*.whl -RUN git clone --recursive ${AITER_REPO} -RUN cd aiter \ - && git checkout ${AITER_BRANCH} \ - && git submodule update --init --recursive \ - && pip install -r requirements.txt -RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl -RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install +# FROM base AS build_aiter +# ARG AITER_BRANCH +# ARG AITER_REPO +# # RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ +# # pip install /install/*.whl +# RUN git clone --recursive ${AITER_REPO} +# RUN cd aiter \ +# && git checkout ${AITER_BRANCH} \ +# && git submodule update --init --recursive \ +# && pip install -r requirements.txt +# RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +# RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install FROM base AS debs RUN mkdir /app/debs -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - cp /install/*.deb /app/debs +# RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ +# cp /install/*.deb /app/debs RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ - cp /install/*.whl /app/debs + cp /install/*.whl /app/debs RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ cp /install/*.whl /app/debs RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ cp /install/*.whl /app/debs -RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ - cp /install/*.whl /app/debs +# RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ +# cp /install/*.whl /app/debs FROM base AS final -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status +# RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ +# dpkg -i /install/*deb \ +# && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ +# && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +# RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ +# dpkg -i /install/*deb \ +# && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ +# && sed -i 's/, rccl \(.*\), 
rocalution/, rocalution/g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl -RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ - pip install /install/*.whl +# RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ +# pip install /install/*.whl ARG BASE_IMAGE ARG HIPBLAS_COMMON_BRANCH @@ -167,15 +176,15 @@ ARG FA_REPO ARG AITER_BRANCH ARG AITER_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ - && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ - && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ - && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + # && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + # && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ + # && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ - && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt + # && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + # && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md new file mode 100644 index 000000000000..c24722e9350a --- /dev/null +++ b/docs/dev-docker/README.md @@ -0,0 +1,568 @@ +# vllm FP8 Latency and Throughput benchmarks with vLLM on the AMD Instinct™ MI300X accelerator + +Documentation for Inferencing with vLLM on AMD Instinct™ MI300X platforms. + +## Overview + +vLLM is a toolkit and library for large language model (LLM) inference and serving. It deploys the PagedAttention algorithm, which reduces memory consumption and increases throughput by leveraging dynamic key and value allocation in GPU memory. vLLM also incorporates many recent LLM acceleration and quantization algorithms, such as fp8 GeMM, fp8 KV cache, continuous batching, flash attention, hip graph, tensor parallel, GPTQ, AWQ, and token speculation. In addition, AMD implements high-performance custom kernels and modules in vLLM to enhance performance further. + +This documentation includes information for running the popular Llama 3.1 series models from Meta using a pre-built AMD vLLM docker image optimized for an AMD Instinct™ MI300X or MI325X accelerator. 
The container is publicly available at [AMD Infinity Hub](https://www.amd.com/en/developer/resources/infinity-hub.html).
+
+The pre-built image includes:
+
+- ROCm™ 6.4.1
+- HipblasLT 0.15
+- vLLM 0.9.1
+- PyTorch 2.7
+
+## Pull latest Docker Image
+
+Pull the most recent validated docker image with `docker pull rocm/vllm-dev:main`.
+
+## What is New
+
+- No need to specify the --compilation-config parameter, these options were turned on by default
+- Fixed llama3.1 405b CAR issue (no longer need --disable-custom-all-reduce)
+- Fixed +rms_norm custom kernel issue
+- Added quick reduce (set VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP to enable. Supported modes are FP, INT8, INT6, INT4)
+- Mitigated the commandr model causing GPU crash through a workaround until the driver issue is fixed
+
+## Known Issues and Workarounds
+
+- AITER does not support fp8 kv cache
+
+## Performance Results
+
+The data in the following tables is a reference point to help users validate observed performance. It should not be considered the peak performance that can be delivered by the AMD Instinct™ MI300X accelerator with vLLM. See the MLPerf section in this document for information about MLPerf 4.1 inference results. The performance numbers below were collected using the steps described later in this document.
+*Note: Benchmarks were run with the benchmark scripts from [v0.8.5](https://github.com/vllm-project/vllm/tree/v0.8.5/benchmarks).*
+
+### Throughput Measurements
+
+The table below shows throughput for a client-server scenario under maximum load, where a local inference client feeds requests at an infinite rate.
+
+| Model | Precision | TP Size | Input | Output | Num Prompts | Max Num Seqs | Throughput (tokens/s) |
+|-------|-----------|---------|-------|--------|-------------|--------------|-----------------------|
+| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 3200 | 3200 | 12638.9 |
+| | | | 128 | 4096 | 1500 | 1500 | 10756.8 |
+| | | | 500 | 2000 | 2000 | 2000 | 10691.7 |
+| | | | 2048 | 2048 | 1500 | 1500 | 7354.9 |
+| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 1500 | 1500 | 3912.8 |
+| | | | 128 | 4096 | 1500 | 1500 | 3084.7 |
+| | | | 500 | 2000 | 2000 | 2000 | 2935.9 |
+| | | | 2048 | 2048 | 500 | 500 | 2191.5 |
+
+*TP stands for Tensor Parallelism.*
+
+Supermicro AS-8125GS-TNMR2 with 2x AMD EPYC 9554 Processors, 2.25 TiB RAM, 8x AMD Instinct MI300X (192GiB, 750W) GPUs, Ubuntu 22.04, and amdgpu driver 6.8.5
+
+### Latency Measurements
+
+The table below shows latency measurements, i.e., the time from when the system receives an input to when the model produces a result.
+ +| Model | Precision | TP Size | Batch Size | Input | Output | MI300X Latency (sec) | +|-------|-----------|----------|------------|--------|---------|-------------------| +| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 17.236 | +| | | | 2 | 128 | 2048 | 18.057 | +| | | | 4 | 128 | 2048 | 18.450 | +| | | | 8 | 128 | 2048 | 19.677 | +| | | | 16 | 128 | 2048 | 22.072 | +| | | | 32 | 128 | 2048 | 24.932 | +| | | | 64 | 128 | 2048 | 33.287 | +| | | | 128 | 128 | 2048 | 46.484 | +| | | | 1 | 2048 | 2048 | 17.500 | +| | | | 2 | 2048 | 2048 | 18.055 | +| | | | 4 | 2048 | 2048 | 18.858 | +| | | | 8 | 2048 | 2048 | 20.161 | +| | | | 16 | 2048 | 2048 | 22.347 | +| | | | 32 | 2048 | 2048 | 25.966 | +| | | | 64 | 2048 | 2048 | 35.324 | +| | | | 128 | 2048 | 2048 | 52.394 | +| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 48.453 | +| | | | 2 | 128 | 2048 | 49.268 | +| | | | 4 | 128 | 2048 | 51.136 | +| | | | 8 | 128 | 2048 | 54.226 | +| | | | 16 | 128 | 2048 | 57.274 | +| | | | 32 | 128 | 2048 | 68.901 | +| | | | 64 | 128 | 2048 | 88.631 | +| | | | 128 | 128 | 2048 | 117.027 | +| | | | 1 | 2048 | 2048 | 48.362 | +| | | | 2 | 2048 | 2048 | 49.121 | +| | | | 4 | 2048 | 2048 | 52.347 | +| | | | 8 | 2048 | 2048 | 54.471 | +| | | | 16 | 2048 | 2048 | 57.841 | +| | | | 32 | 2048 | 2048 | 70.538 | +| | | | 64 | 2048 | 2048 | 91.452 | +| | | | 128 | 2048 | 2048 | 125.471 | + +*TP stands for Tensor Parallelism.* + +Supermicro AS-8125GS-TNMR2 with 2x AMD EPYC 9554 Processors, 2.25 TiB RAM, 8x AMD Instinct MI300X (192GiB, 750W) GPUs, Ubuntu 22.04, and amdgpu driver 6.8.5 + +## Reproducing Benchmarked Results + +### Preparation - Obtaining access to models + +The vllm-dev docker image should work with any model supported by vLLM. When running with FP8, AMD has quantized models available for a variety of popular models, or you can quantize models yourself using Quark. If needed, the vLLM benchmark scripts will automatically download models and then store them in a Hugging Face cache directory for reuse in future tests. Alternatively, you can choose to download the model to the cache (or to another directory on the system) in advance. + +Many HuggingFace models, including Llama-3.1, have gated access. You will need to set up an account at (https://huggingface.co), search for the model of interest, and request access if necessary. You will also need to create a token for accessing these models from vLLM: open your user profile (https://huggingface.co/settings/profile), select "Access Tokens", press "+ Create New Token", and create a new Read token. + +### System optimization + +Before running performance tests you should ensure the system is optimized according to the [ROCm Documentation](https://rocm.docs.amd.com/en/latest/how-to/system-optimization/mi300x.html). In particular, it is important to ensure that NUMA auto-balancing is disabled. + +*Note: Check that NUMA balancing is properly set by inspecting the output of the command below, which should have a value of 0, with, `cat /proc/sys/kernel/numa_balancing`* + +### Launch AMD vLLM Docker + +Download and launch the docker. 
The HF_TOKEN is required to be set (either here or after launching the container) if you want to allow vLLM to download gated models automatically; use your HuggingFace token in place of `` in the command below:
+
+```bash
+docker run -it --rm --ipc=host --network=host --group-add render \
+    --privileged --security-opt seccomp=unconfined \
+    --cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
+    --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
+    -e HF_HOME=/data \
+    -e HF_TOKEN= \
+    -v /data:/data \
+    rocm/vllm-dev:main
+```
+
+Note: The instructions in this document use `/data` to store the models. If you choose a different directory, you will also need to make that change to the host volume mount when launching the docker container. For example, `-v /home/username/models:/data` in place of `-v /data:/data` would store the models in /home/username/models on the host. Some models can be quite large; please ensure that you have sufficient disk space prior to downloading the model. Since the model download may take a long time, you can use `tmux` or `screen` to avoid getting disconnected.
+
+### Downloading models with huggingface-cli
+
+If you would like to download models directly (instead of allowing vLLM to download them automatically), you can use the huggingface-cli inside the running docker container. Log in using the token that you created earlier. (Note: it is not necessary to save it as a git credential.)
+
+```bash
+huggingface-cli login
+```
+
+You can download a model to the huggingface-cache directory using a command similar to the following (substituting the name of the model you wish to download):
+
+```bash
+sudo mkdir -p /data/huggingface-cache
+sudo chmod -R a+w /data/huggingface-cache
+HF_HOME=/data/huggingface-cache huggingface-cli download meta-llama/Llama-3.1-405B-Instruct --exclude "original/*"
+```
+
+Alternatively, you may wish to download the model to a specific directory, e.g. so you can quantize the model with Quark:
+
+```bash
+sudo mkdir -p /data/llama-3.1
+sudo chmod -R a+w /data/llama-3.1
+huggingface-cli download meta-llama/Llama-3.1-405B-Instruct --exclude "original/*" --local-dir /data/llama-3.1/Llama-3.1-405B-Instruct
+```
+
+In the benchmark commands provided later in this document, replace the model name (e.g. `amd/Llama-3.1-405B-Instruct-FP8-KV`) with the path to the model (e.g. `/data/llama-3.1/Llama-3.1-405B-Instruct`).
+
+### Use pre-quantized models
+
+AMD has provided [FP8-quantized versions](https://huggingface.co/collections/amd/quark-quantized-ocp-fp8-models-66db7936d18fcbaf95d4405c) of several models in order to make them easier to run on MI300X / MI325X, including:
+
+-
+-
+-
+
+Some models may be private to those who are members of .
+
+These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to
+
+### Quantize your own models
+
+This is an optional step if you would like to quantize your own model instead of using AMD's pre-quantized models. These instructions use Llama-3.1-405B as an example, but the commands are similar for other models.
+
+First, download the model to the /data/llama-3.1 directory as described above.
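+
+Because the 405B checkpoint is large, an optional check such as the one below (an illustrative sketch assuming the `/data/llama-3.1` location used above) can confirm that the download completed and that sufficient disk space remains before quantizing:
+
+```bash
+# Optional sanity check before quantizing (paths follow the example above).
+du -sh /data/llama-3.1/Llama-3.1-405B-Instruct           # total size of the downloaded checkpoint
+df -h /data                                              # free space remaining on the host volume
+ls /data/llama-3.1/Llama-3.1-405B-Instruct/config.json   # a key file that should be present
+```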
+ +[Download and install Quark](https://quark.docs.amd.com/latest/install.html) + +Run the quantization script in the example folder using the following command line: + +```bash +# path to quark quantization script +export QUARK_DIR=/data/quark-0.6.0+dba9ca364/examples/torch/language_modeling/llm_ptq/quantize_quark.py +# path to Model +export MODEL_DIR=/data/llama-3.1/Llama-3.1-405B-Instruct +python3 $QUARK_DIR \ +--model_dir $MODEL_DIR \ +--output_dir Llama-3.1-405B-Instruct-FP8-KV \ +--kv_cache_dtype fp8 \ +--quant_scheme w_fp8_a_fp8 \ +--num_calib_data 128 \ +--model_export quark_safetensors \ +--no_weight_matrix_merge \ +--multi_gpu +``` + +Note: the `--multi_gpu` parameter can be omitted for small models that fit on a single GPU. + +## Performance testing with AMD vLLM Docker + +### Performance environment variables + +Some environment variables enhance the performance of the vLLM kernels on the MI300X / MI325X accelerator. See the AMD Instinct MI300X workload optimization guide for more information. + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 + +``` + +### vLLM engine performance settings + +vLLM provides a number of engine options which can be changed to improve performance. Refer to the [vLLM Engine Args](https://docs.vllm.ai/en/stable/usage/engine_args.html) documentation for the complete list of vLLM engine options. + +Below is a list of a few of the key vLLM engine arguments for performance; these can be passed to the vLLM benchmark scripts: +- **--max-model-len** : Maximum context length supported by the model instance. Can be set to a lower value than model configuration value to improve performance and gpu memory utilization. +- **--max-num-batched-tokens** : The maximum prefill size, i.e., how many prompt tokens can be packed together in a single prefill. Set to a higher value to improve prefill performance at the cost of higher gpu memory utilization. 65536 works well for LLama models. +- **--max-num-seqs** : The maximum decode batch size (default 256). Using larger values will allow more prompts to be processed concurrently, resulting in increased throughput (possibly at the expense of higher latency). If the value is too large, there may not be enough GPU memory for the KV cache, resulting in requests getting preempted. The optimal value will depend on the GPU memory, model size, and maximum context length. +- **--max-seq-len-to-capture** : Maximum sequence length for which Hip-graphs are captured and utilized. It's recommended to use Hip-graphs for the best decode performance. The default value of this parameter is 8K, which is lower than the large context lengths supported by recent models such as LLama. Set this parameter to max-model-len or maximum context length supported by the model for best performance. +- **--gpu-memory-utilization** : The ratio of GPU memory reserved by a vLLM instance. Default value is 0.9. Increasing the value (potentially as high as 0.99) will increase the amount of memory available for KV cache. When running in graph mode (i.e. not using `--enforce-eager`), it may be necessary to use a slightly smaller value of 0.92 - 0.95 to ensure adequate memory is available for the HIP graph. + +### Latency Benchmark + +vLLM's benchmark_latency.py script measures end-to-end latency for a specified model, input/output length, and batch size. 
+ +You can run latency tests for FP8 models with: + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 +MODEL=amd/Llama-3.1-405B-Instruct-FP8-KV +BS=1 +IN=128 +OUT=2048 +TP=8 + +python3 /app/vllm/benchmarks/benchmark_latency.py \ + --distributed-executor-backend mp \ + --dtype float16 \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --model $MODEL \ + --batch-size $BS \ + --input-len $IN \ + --output-len $OUT \ + --tensor-parallel-size $TP \ + --num-iters-warmup 3 \ + --num-iters 5 + +``` + +When measuring models with long context lengths, performance may improve by setting `--max-model-len` to a smaller value. It is important, however, to ensure that the `--max-model-len` is at least as large as the IN + OUT token counts. + +To estimate Time To First Token (TTFT) with the benchmark_latency.py script, set the OUT to 1 token. It is also recommended to use `--enforce-eager` to get a more accurate measurement of the time that it actually takes to generate the first token. (For a more comprehensive measurement of TTFT, use the Online Serving Benchmark.) + +For additional information about the available parameters run: + +```bash +/app/vllm/benchmarks/benchmark_latency.py -h +``` + +### Throughput Benchmark + +vLLM's benchmark_throughput.py script measures offline throughput. It can either use an input dataset or random prompts with fixed input/output lengths. + +You can run throughput tests for FP8 models with: + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 +MODEL=amd/Llama-3.1-405B-Instruct-FP8-KV +IN=128 +OUT=2048 +TP=8 +PROMPTS=1500 +MAX_NUM_SEQS=1500 + +python3 /app/vllm/benchmarks/benchmark_throughput.py \ + --distributed-executor-backend mp \ + --kv-cache-dtype fp8 \ + --dtype float16 \ + --disable-detokenize \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --model $MODEL \ + --max-model-len 8192 \ + --max-num-batched-tokens 131072 \ + --max-seq-len-to-capture 131072 \ + --input-len $IN \ + --output-len $OUT \ + --tensor-parallel-size $TP \ + --num-prompts $PROMPTS \ + --max-num-seqs $MAX_NUM_SEQS +``` + +For FP16 models, remove `--kv-cache-dtype fp8`. + +When measuring models with long context lengths, performance may improve by setting `--max-model-len` to a smaller value (8192 in this example). It is important, however, to ensure that the `--max-model-len` is at least as large as the IN + OUT token counts. + +It is important to tune vLLM’s --max-num-seqs value to an appropriate value depending on the model and input/output lengths. Larger values will allow vLLM to leverage more of the GPU memory for KV Cache and process more prompts concurrently. But if the value is too large, the KV cache will reach its capacity and vLLM will have to cancel and re-process some prompts. Suggested values for various models and configurations are listed below. + +For models that fit on a single GPU, it is usually best to run with `--tensor-parallel-size 1`. Requests can be distributed across multiple copies of vLLM running on different GPUs. This will be more efficient than running a single copy of the model with `--tensor-parallel-size 8`. (Note: the benchmark_throughput.py script does not include direct support for using multiple copies of vLLM) + +For optimal performance, the PROMPTS value should be a multiple of the MAX_NUM_SEQS value -- for example, if MAX_NUM_SEQS=1500 then the PROMPTS value could be 1500, 3000, etc. 
If PROMPTS is smaller than MAX_NUM_SEQS then there won’t be enough prompts for vLLM to maximize concurrency. + +For additional information about the available parameters run: + +```bash +python3 /app/vllm/benchmarks/benchmark_throughput.py -h +``` + +### Online Serving Benchmark + +Benchmark Llama-3.1-70B with input 4096 tokens, output 512 tokens and tensor parallelism 8 as an example, + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +vllm serve amd/Llama-3.1-70B-Instruct-FP8-KV \ + --swap-space 16 \ + --disable-log-requests \ + --quantization fp8 \ + --kv-cache-dtype fp8 \ + --dtype float16 \ + --max-model-len 8192 \ + --tensor-parallel-size 8 \ + --max-num-batched-tokens 65536 \ + --gpu-memory-utilization 0.99 \ + --num_scheduler-steps 10 +``` + +For FP16 models, remove `--kv-cache-dtype fp8`. Change port (for example --port 8005) if port=8000 is currently being used by other processes. + +Run client in a separate terminal. Use port_id from previous step else port-id=8000. + +```bash +python /app/vllm/benchmarks/benchmark_serving.py \ + --port 8000 \ + --model amd/Llama-3.1-70B-Instruct-FP8-KV \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 512 \ + --request-rate 1 \ + --ignore-eos \ + --num-prompts 500 \ + --percentile-metrics ttft,tpot,itl,e2el +``` + +Once all prompts are processed, terminate the server gracefully (ctrl+c). + +### Running DeepSeek-V3 and DeepSeek-R1 + +We have experimental support for running both DeepSeek-V3 and DeepSeek-R1 models. +*Note there are currently limitations and `--max-model-len` cannot be greater than 32768* + +```bash +docker run -it --rm --ipc=host --network=host --group-add render \ + --privileged --security-opt seccomp=unconfined \ + --cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \ + --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ + -e VLLM_USE_TRITON_FLASH_ATTN=1 \ + -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_MLA_DISABLE=0 \ + rocm/vllm-dev:main + +# Online serving +vllm serve deepseek-ai/DeepSeek-V3 \ + --disable-log-requests \ + --tensor-parallel-size 8 \ + --trust-remote-code \ + --max-model-len 131072 \ + --block-size=1 + +python3 /app/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model deepseek-ai/DeepSeek-V3 \ + --max-concurrency 256\ + --dataset-name random \ + --random-input-len 128 \ + --random-output-len 128 \ + --num-prompts 1000 + +# Offline throughput +python3 /app/vllm/benchmarks/benchmark_throughput.py --model deepseek-ai/DeepSeek-V3 \ + --input-len <> --output-len <> --tensor-parallel-size 8 \ + --quantization fp8 --kv-cache-dtype fp8 --dtype float16 \ + --max-model-len 32768 --block-size=1 --trust-remote-code + +# Offline Latency +python /app/vllm/benchmarks/benchmark_latency.py --model deepseek-ai/DeepSeek-V3 \ +--tensor-parallel-size 8 --trust-remote-code --max-model-len 32768 --block-size=1 \ +--batch-size <> --input-len <> --output-len <> +``` + +### CPX mode + +Currently only CPX-NPS1 mode is supported. So ONLY tp=1 is supported in CPX mode. +But multiple instances can be started simultaneously (if needed) in CPX-NPS1 mode. + +Set GPUs in CPX mode with: + +```bash +rocm-smi --setcomputepartition cpx +``` + +Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512. As mentioned above, tp=1. 
+
+```bash
+HIP_VISIBLE_DEVICES=0 \
+python3 /app/vllm/benchmarks/benchmark_throughput.py \
+    --max-model-len 4608 \
+    --num-scheduler-steps 10 \
+    --num-prompts 100 \
+    --model amd/Llama-3.1-8B-Instruct-FP8-KV \
+    --input-len 4096 \
+    --output-len 512 \
+    --dtype float16 \
+    --tensor-parallel-size 1 \
+    --output-json \
+    --quantization fp8 \
+    --gpu-memory-utilization 0.99
+```
+
+Set the GPUs back to SPX mode with:
+
+```bash
+rocm-smi --setcomputepartition spx
+```
+
+### Speculative Decoding
+
+Speculative decoding is one of the key features in vLLM and is supported on MI300. Below is an example of the performance benchmark with and without speculative decoding for Llama 3.1 405B, using Llama 3.1 8B as the draft model.
+
+Without Speculative Decoding -
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+python /app/vllm/benchmarks/benchmark_latency.py --model amd/Llama-3.1-405B-Instruct-FP8-KV --max-model-len 26720 -tp 8 --batch-size 1 --input-len 1024 --output-len 128
+```
+
+With Speculative Decoding -
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+python /app/vllm/benchmarks/benchmark_latency.py --model amd/Llama-3.1-405B-Instruct-FP8-KV --max-model-len 26720 -tp 8 --batch-size 1 --input-len 1024 --output-len 128 --speculative-model amd/Llama-3.1-8B-Instruct-FP8-KV --num-speculative-tokens 5
+```
+
+You should see an improvement in end-to-end latency.
+
+### AITER use cases
+
+The `rocm/vllm-dev:main` image has experimental [AITER](https://github.com/ROCm/aiter) support, which can yield a significant performance increase for some model/input/output/batch size configurations. To enable it, set `VLLM_ROCM_USE_AITER=1` (the default is `0`). When building your own image, follow the [Docker build steps](#Docker-manifest) using the [aiter_integration_final](https://github.com/ROCm/vllm/tree/aiter_integration_final) branch.
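+
+As a quick sanity check (an illustrative sketch, not part of the official benchmark flow, and assuming the rocm/vllm-dev container environment), you can confirm inside the container that the flag is picked up by vLLM and that the `aiter` package is importable:
+
+```bash
+# Verify that AITER is enabled and installed before benchmarking.
+export VLLM_ROCM_USE_AITER=1
+python3 -c "import importlib.util; import vllm.envs as envs; print('VLLM_ROCM_USE_AITER:', envs.VLLM_ROCM_USE_AITER); print('aiter installed:', importlib.util.find_spec('aiter') is not None)"
+```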
+ +Some use cases include: +- amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV +- amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + +```bash +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_USE_AITER_RMSNORM=0 +python3 /app/vllm/benchmarks/benchmark_latency.py --model amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV -tp 8 --batch-size 256 --input-len 128 --output-len 2048 +``` + +## MMLU_PRO_Biology Accuracy Evaluation + +### FP16 + +vllm (pretrained=models--meta-llama--Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 64 + +| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr| +|-------|------:|--------------|-----:|-----------|---|-----:|---|-----:| +|biology| 0|custom-extract| 5|exact_match|↑ |0.8466|± |0.0135| + +### FP8 + +vllm (pretrained=models--meta-llama--Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,quantization=fp8,quantized_weights_path=/llama.safetensors,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 32 + +| Tasks |Version| Filter |n-shot| Metric | |Value| |Stderr| +|-------|------:|--------------|-----:|-----------|---|----:|---|-----:| +|biology| 0|custom-extract| 5|exact_match|↑ |0.848|± |0.0134| + +## Performance + +### MLPerf Performance Results + +#### LLama-2-70B + +Please refer to the [Benchmarking Machine Learning using ROCm and AMD GPUs: Reproducing Our MLPerf Inference Submission — ROCm Blogs](https://rocm.blogs.amd.com/artificial-intelligence/mlperf-inf-4-1/README.html) for information on reproducing MLPerf 4.1 Inference results. Note that due to changes in vLLM, it is not possible to use these instructions with the current rocm/vllm-dev docker image. Due to recent changes in vLLM, the instructions for MLPerf 4.1 submission do not apply to the current rocm/vllm-dev docker image. + +## Docker Manifest + +To reproduce the release docker: + +```bash + git clone https://github.com/ROCm/vllm.git + cd vllm + git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f + docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 . +``` + +### Building AITER Image + +Use AITER release candidate branch instead: + +```bash + git clone https://github.com/ROCm/vllm.git + cd vllm + git checkout aiter_integration_final + docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 . +``` + +## Changelog + +20250715_aiter: +- No need to specify the --compilation-config parameter, these options were turned on by default +- Fixed llama3.1 405b CAR issue (no longer need --disable-custom-all-reduce) +- Fixed +rms_norm custom kernel issue +- Added quick reduce (set VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP to enable. 
Supported modes are FP, INT8, INT6, INT4) +- Mitigated the commandr model causing GPU crash through a workaround until the driver issue is fixed + +20250620_aiter: +- V1 on by default (use VLLM_USE_V1=0 to override) +- Fixed detokenizers issue +- Fixed AITER MoE issues +- vLLM v0.9.1 + +20250605_aiter: +- Updated to ROCm 6.4.1 and vLLM v0.9.0.1 +- AITER MHA +- IBM 3d kernel for unified attention +- Full graph capture for split attention + +20250521_aiter: +- AITER V1 engine performance improvement + +20250513_aiter: +- Out of memory bug fix +- PyTorch fixes +- Tunable ops fixes + +20250410_aiter: +- 2-stage MoE +- MLA from AITER + +20250325_aiter: +- Improved DeepSeek-V3/R1 performance +- Initial Gemma-3 enablement +- Detokenizer disablement +- Torch.compile support + +20250305_aiter: +- AITER improvements +- Support for FP8 skinny GEMM + +20250207_aiter: +- More performant AITER +- Bug fixes + +20250205_aiter: +- [AITER](https://github.com/ROCm/aiter) support +- Performance improvement for custom paged attention +- Reduced memory overhead bug fix + +20250124: +- Fix accuracy issue with 405B FP8 Triton FA +- Fixed accuracy issue with TP8 + +20250117: +- [Experimental DeepSeek-V3 and DeepSeek-R1 support](#running-deepseek-v3-and-deepseek-r1) diff --git a/requirements/common.txt b/requirements/common.txt index e21abfb9a30b..cbcaa4474547 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -2,7 +2,7 @@ regex # Replace re for higher-performance regex matching cachetools psutil sentencepiece # Required for LLaMA tokenizer. -numpy +numpy==1.26.4 requests >= 2.26.0 tqdm blake3 diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 10879f0be83c..d1f4778e25fd 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -322,4 +322,4 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): req.cache_salt = "test_salt" with suppress(Exception): await serving_chat.create_chat_completion(req) - assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt" + assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt" \ No newline at end of file diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index a90c4f7969a0..d0e9734c77a4 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -3,7 +3,7 @@ """Attention layer ROCm GPUs.""" import itertools from dataclasses import dataclass -from functools import cache +from functools import cache, lru_cache from typing import List, Optional, Tuple, Type import torch @@ -30,7 +30,7 @@ @cache def is_rocm_aiter_paged_attn_enabled() -> bool: return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN \ - and envs.VLLM_ROCM_USE_AITER \ + and envs.VLLM_ROCM_USE_AITER @cache @@ -392,6 +392,26 @@ def _get_seq_len_block_table_args( raise AttributeError(f"Invalid attention type {str(attn_type)}") +@lru_cache(maxsize=1) +def get_static_kvscale( + k_scale_float: float, + v_scale_float: float, + num_kv_heads: int, + num_blocks: int, + block_size: int, + device, +): + k_scale = torch.empty((num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=device) + v_scale = torch.empty((num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=device) + k_scale.fill_(k_scale_float) + v_scale.fill_(v_scale_float) + return k_scale, v_scale + + class ROCmFlashAttentionImpl(AttentionImpl): """ If the input 
tensors contain prompt tokens, the layout is as follows: @@ -483,9 +503,16 @@ def __init__( "FA backend instead by setting the env var " "`VLLM_USE_TRITON_FLASH_ATTN=0`") - from vllm.attention.ops.triton_flash_attention import ( # noqa: F401 - triton_attention) - self.triton_attn_func = triton_attention + if not envs.VLLM_ROCM_USE_AITER: + from vllm.attention.ops.triton_flash_attention import ( # noqa: F401 + triton_attention) + self.triton_attn_func = triton_attention + else: + from aiter.ops.triton.mha import flash_attn_varlen_func + from aiter.ops.triton.mha import ( + mha_set_use_int64_strides as set_triton_fa_strides) + set_triton_fa_strides(True) + self.triton_attn_func = flash_attn_varlen_func logger.debug("Using Triton FA in ROCmBackend") if self.sliding_window != (-1, -1): logger.warning("ROCm Triton FA does not currently support " @@ -629,20 +656,26 @@ def forward( if (is_rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 and not self.aiter_kv_scales_initialized and kv_cache.shape != torch.Size([0])): + from vllm.attention.ops.rocm_aiter_paged_attn import ( + AITERPagedAttention) + num_blocks = kv_cache.shape[1] block_size = kv_cache.shape[2] // (self.num_kv_heads * self.head_size) - k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) - v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) + AITERPagedAttention.is_asm_supported = ( + self.head_size == 128 + and self.num_heads // self.num_kv_heads <= 16 + and self.kv_cache_dtype in ["int8", "fp8", "fp8_e4m3"]) + self.aiter_kv_scales_initialized = True - k_scale.fill_(layer._k_scale.item()) - v_scale.fill_(layer._v_scale.item()) - layer._k_scale = k_scale - layer._v_scale = v_scale + if AITERPagedAttention.is_asm_supported: + k_scale, v_scale = get_static_kvscale(layer._k_scale_float, + layer._v_scale_float, + self.num_kv_heads, + num_blocks, block_size, + kv_cache.device) + layer._k_scale = k_scale + layer._v_scale = v_scale # Only update KV cache for decoder self-attention # and encoder-decoder cross-attention @@ -726,32 +759,52 @@ def forward( query.dtype, seq_lens, make_attn_mask=causal_mask) # type: ignore - - use_fp8_scales = (layer._q_scale and layer._k_scale - and layer._v_scale and layer._prob_scale - and (self.kv_cache_dtype == "fp8" - or self.force_fp8_attention)) - - full_scales = ( - layer._q_scale.item(), layer._k_scale.item(), - layer._v_scale.item(), - layer._prob_scale.item()) if use_fp8_scales else None - self.triton_attn_func( - query, - key, - value, - output[:num_prefill_tokens], - query_seq_start_loc, - key_seq_start_loc, - query_max_seq_len, - key_max_seq_len, - causal_mask, - self.scale, - attn_masks[0][None] - if attn_masks is not None else None, - full_scales, - output_scale, - ) + if not envs.VLLM_ROCM_USE_AITER: + use_fp8_scales = (layer._q_scale is not None + and layer._k_scale is not None + and layer._v_scale is not None + and layer._prob_scale is not None and + envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) + full_scales = (layer._q_scale.item(), + layer._k_scale.item(), + layer._v_scale.item(), + layer._prob_scale.item() + ) if use_fp8_scales else None + self.triton_attn_func( + query, + key, + value, + output[:num_prefill_tokens], + query_seq_start_loc, + key_seq_start_loc, + query_max_seq_len, + key_max_seq_len, + causal_mask, + self.scale, + attn_masks[0][None] + if attn_masks is not None else None, + full_scales, + output_scale, + ) + else: + 
output[:num_prefill_tokens] = self.triton_attn_func( + q=query, + k=key, + v=value, + cu_seqlens_q=query_seq_start_loc, + cu_seqlens_k=key_seq_start_loc, + max_seqlen_q=query_max_seq_len, + max_seqlen_k=key_max_seq_len, + dropout_p=0.0, + softmax_scale=self.scale, + causal=causal_mask, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + deterministic=False, + return_lse=False, + return_attn_probs=False, + block_table=None, + ) elif self.use_naive_attn: if self.num_kv_heads != self.num_heads: # Interleave for MQA workaround. @@ -777,6 +830,7 @@ def forward( self.num_heads, self.head_size, self.scale, + causal_mask, attn_masks, ) else: @@ -878,6 +932,28 @@ def forward( layer._v_scale, output_scale, ) + elif is_rocm_aiter_paged_attn_enabled(): + paged_attn.forward_decode( + decode_query, + key_cache, + value_cache, + (decode_meta.block_tables + if self.attn_type != AttentionType.ENCODER_DECODER else + decode_meta.cross_block_tables), + (decode_meta.seq_lens_tensor + if self.attn_type != AttentionType.ENCODER_DECODER else + decode_meta.encoder_seq_lens_tensor), + (decode_meta.max_decode_seq_len + if self.attn_type != AttentionType.ENCODER_DECODER else + decode_meta.max_encoder_seq_len), + self.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + layer._k_scale, + layer._v_scale, + output=output[num_prefill_tokens:], + ) else: # PagedAttention does not support fused quant, manually quantize if output_scale is None: @@ -929,6 +1005,7 @@ def _sdpa_attention( num_heads: int, head_size: int, scale: float, + is_causal: bool, attn_masks: Optional[List[torch.Tensor]] = None, ) -> torch.Tensor: start = 0 @@ -945,7 +1022,7 @@ def _sdpa_attention( key[:, start:end, :], value[:, start:end, :], dropout_p=0.0, - is_causal=attn_masks is None, + is_causal=is_causal, attn_mask=attn_masks[i] if attn_masks else None, scale=scale).movedim(query.dim() - 2, 0) output[start:end, :, :] = sub_out diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 7b6c426b0f85..8752d22a5041 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -237,9 +237,18 @@ def build(self, seq_lens: List[int], query_lens: List[int], # The shape of graph_block_tables is # [max batch size, max context len // block size]. input_block_tables = self.runner.graph_block_tables[:batch_size] + max_blocks = input_block_tables.shape[1] for i, block_table in enumerate(self.block_tables): if block_table: - input_block_tables[i, :len(block_table)] = block_table + num_blocks = len(block_table) + if num_blocks <= max_blocks: + input_block_tables[i, :num_blocks] = block_table + else: + # It may be possible to have more blocks allocated due + # to lookahead slots of multi-step, however, they are + # not used anyway, so can be safely ignored. + input_block_tables[ + i, :max_blocks] = block_table[:max_blocks] block_tables = torch.from_numpy(input_block_tables).to( device, non_blocking=True) else: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 237802afccde..c8a2d7e7bcb0 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,6 +29,8 @@ logger = init_logger(__name__) USE_XFORMERS_OPS = None +if current_platform.is_rocm(): + VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE = envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE def check_xformers_availability(): global USE_XFORMERS_OPS @@ -128,6 +130,11 @@ def __init__( # but requires q to be quantized as well. 
self._q_scale = torch.tensor(1.0, dtype=torch.float32) self._prob_scale = torch.tensor(1.0, dtype=torch.float32) + self._out_scale = None + + # Keeping float32 version of _q_scale tensor for assertions + # during graph capture. Otherwise asserts are triggeting HIP error + self._q_scale_float = 1.0 # We also keep q/k/v_scale on host (cpu) memory for attention # backends that require the scales to be on host instead of on device. @@ -229,6 +236,9 @@ def forward( # shape does not match the query shape, so we optionally let the model # definition specify the output tensor shape. output_shape: Optional[torch.Size] = None, + positions: torch.Tensor = None, + cos_sin_cache: torch.Tensor = None, + is_neox: bool = False, ) -> torch.Tensor: """ The KV cache is stored inside this class and is accessed via @@ -246,9 +256,15 @@ def forward( if self.use_output: output_shape = (output_shape if output_shape is not None else query.shape) - output = torch.zeros(output_shape, - dtype=query.dtype, - device=query.device) + if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE: + output = torch.empty(output_shape, + dtype=query.dtype, + device=query.device) + else: + output = torch.zeros(output_shape, + dtype=query.dtype, + device=query.device) + hidden_size = output_shape[-1] # We skip reshaping query, key and value tensors for the MLA # backend since these tensors have different semantics and are @@ -270,15 +286,19 @@ def forward( attn_metadata = attn_metadata[self.layer_name] self_kv_cache = self.kv_cache[forward_context.virtual_engine] self.impl.forward(self, - query, - key, - value, - self_kv_cache, - attn_metadata, - output=output) + query, + key, + value, + self_kv_cache, + attn_metadata, + output=output) else: - torch.ops.vllm.unified_attention_with_output( - query, key, value, output, self.layer_name) + if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE: + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, self.layer_name, None, positions, cos_sin_cache, True) + else: + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: @@ -495,6 +515,9 @@ def unified_attention_with_output( output: torch.Tensor, layer_name: str, output_scale: Optional[torch.Tensor] = None, + positions: Optional[torch.Tensor] = None, + cos_sin_cache: Optional[torch.Tensor] = None, + is_neox: bool = False, output_block_scale: Optional[torch.Tensor] = None, ) -> None: wait_for_kv_layer_from_connector(layer_name) @@ -504,15 +527,30 @@ def unified_attention_with_output( attn_metadata = attn_metadata[layer_name] self = forward_context.no_compile_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward(self, - query, - key, - value, - kv_cache, - attn_metadata, - output=output, - output_scale=output_scale, - output_block_scale=output_block_scale) + + if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE: + from vllm.v1.attention.backends.triton_attn import TritonAttentionImpl + assert isinstance(self.impl, TritonAttentionImpl), f"Expect attention implementation = TritonAttentionImpl for VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE=1 but got {self.impl=}" + assert self.impl.kv_sharing_target_layer_name is None, "kv_sharing_target_layer_name error" + self.impl.forward(self, + query, + key, + value, + kv_cache, + attn_metadata, + output=output, + output_scale=output_scale, + positions=positions, cos_sin_cache=cos_sin_cache, is_neox=is_neox) + else: + 
self.impl.forward(self, + query, + key, + value, + kv_cache, + attn_metadata, + output=output, + output_scale=output_scale, + output_block_scale=output_block_scale) maybe_save_kv_layer_to_connector(layer_name, kv_cache) @@ -524,6 +562,9 @@ def unified_attention_with_output_fake( output: torch.Tensor, layer_name: str, output_scale: Optional[torch.Tensor] = None, + positions: Optional[torch.Tensor] = None, + cos_sin_cache: Optional[torch.Tensor] = None, + is_neox: bool = False, output_block_scale: Optional[torch.Tensor] = None, ) -> None: return diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index b0cb7ffce6c6..bb3c075c60ee 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -7,6 +7,7 @@ import torch from vllm.platforms import current_platform +from vllm.platforms.rocm import not_mi350 from vllm.triton_utils import tl, triton # Static kernels parameters @@ -864,7 +865,10 @@ def context_attention_fwd(q, max_seq_len = 0 if max_seq_len is None else max_seq_len extra_kargs = {} if current_platform.is_rocm(): - extra_kargs = {"kpack": 1, "waves_per_eu": 2} + if not_mi350(): + extra_kargs = {"kpack": 1, "waves_per_eu": 2} + else: + extra_kargs = {"waves_per_eu": 2} grid = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"])) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index ad97152e208b..1507621c29fa 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -13,6 +13,7 @@ class AITERPagedAttention(PagedAttention): + is_asm_supported: bool = False @staticmethod def write_to_paged_cache( @@ -25,20 +26,30 @@ def write_to_paged_cache( k_scale: torch.Tensor, v_scale: torch.Tensor, ) -> None: - if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, slot_mapping, - kv_cache_dtype, k_scale, - v_scale) + if not AITERPagedAttention.is_asm_supported: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) else: - kv_cache_torch_dtype = (FP8_DTYPE - if "fp8" in kv_cache_dtype else torch.int8) + kv_cache_torch_dtype = FP8_DTYPE \ + if "fp8" in kv_cache_dtype else torch.int8 key_cache = key_cache.view(kv_cache_torch_dtype) value_cache = value_cache.view(kv_cache_torch_dtype) - rocm_aiter.reshape_and_cache_with_pertoken_quant( - key, value, key_cache, value_cache, k_scale, v_scale, - slot_mapping.flatten(), True) + # rocm_aiter.reshape_and_cache_with_pertoken_quant( + # key, value, key_cache, value_cache, k_scale, v_scale, + # slot_mapping.flatten(), True) + rocm_aiter.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.flatten(), + kv_cache_dtype, k_scale, v_scale, + True) @staticmethod def forward_decode( @@ -59,44 +70,76 @@ def forward_decode( blocksparse_vert_stride: int = 0, blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: - return PagedAttention.forward_decode( - query=query, - key_cache=key_cache, - value_cache=value_cache, - block_tables=block_tables, - seq_lens=seq_lens, - max_seq_len=max_seq_len, - kv_cache_dtype=kv_cache_dtype, - num_kv_heads=num_kv_heads, - scale=scale, - alibi_slopes=alibi_slopes, - k_scale=k_scale, - v_scale=v_scale, - tp_rank=tp_rank, - 
blocksparse_local_blocks=blocksparse_local_blocks, - blocksparse_vert_stride=blocksparse_vert_stride, - blocksparse_block_size=blocksparse_block_size, - blocksparse_head_sliding_step=blocksparse_head_sliding_step) + if output is None: + output = torch.empty_like(query) + block_size = value_cache.shape[3] + if not AITERPagedAttention.is_asm_supported: + import aiter + + max_num_partitions = (max_seq_len + 256 - 1) // 256 + assert 256 % block_size == 0 + num_seqs, num_heads, head_size = query.shape + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + return aiter.paged_attention_rocm( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + None, + 256, + ) if "fp8" in kv_cache_dtype: - key_cache = key_cache.view(torch.float8_e4m3fnuz) - value_cache = value_cache.view(torch.float8_e4m3fnuz) + kv_cache_torch_dtype = FP8_DTYPE + # kv_cache_torch_dtype = torch.int8 + key_cache = key_cache.view(kv_cache_torch_dtype) + value_cache = value_cache.view(kv_cache_torch_dtype) if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: # use blocksparse paged attention block_size = value_cache.size(-1) - assert (blocksparse_block_size > 0 and - blocksparse_block_size % block_size == 0), \ - (f"{blocksparse_block_size=} needs to be a multiple of" - f"{block_size=} used in block_tables.") + assert (blocksparse_block_size > 0 + and blocksparse_block_size % block_size == 0), ( + f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables.") - output = torch.empty_like(query) - block_size = value_cache.shape[3] max_num_blocks_per_seq = cdiv(max_seq_len, block_size) - rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, - seq_lens, max_num_blocks_per_seq, k_scale, - v_scale, output) + rocm_aiter.pa_fwd_asm( + query, + key_cache, + value_cache, + # asm_V_shuffle(value_cache), + block_tables, + seq_lens, + max_num_blocks_per_seq, + K_QScale=k_scale, + V_QScale=v_scale, + out_=output, + ) return output diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 71bb23657921..f7342bc95aca 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -54,7 +54,7 @@ class CustomAllreduce: def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device], - max_size=2 * 8192 * 1024) -> None: + max_size=8 * 8192 * 1024) -> None: """ Args: group: the process group to work on. 
If None, it will use the diff --git a/vllm/envs.py b/vllm/envs.py index ca70bc369de4..ea45daa8f756 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -17,7 +17,8 @@ VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = True - VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = True + VLLM_USE_ROCM_FP8_FLASH_ATTN: bool = True + VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 @@ -95,10 +96,14 @@ VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True + VLLM_ROCM_USE_AITER_CK_TILE_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_RMSNORM: bool = False VLLM_ROCM_USE_AITER_MLA: bool = True VLLM_ROCM_USE_AITER_MHA: bool = True + VLLM_USE_AITER_TRITON_SILU_MUL: bool = False + VLLM_TRITON_FP4_GEMM_USE_ASM: bool = False + VLLM_USE_AITER_TRITON_ROPE: bool = False VLLM_ROCM_USE_SKINNY_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True @@ -167,6 +172,7 @@ VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False + VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE: bool = False def get_default_cache_root(): @@ -370,6 +376,11 @@ def get_vllm_port() -> Optional[int]: lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), + # use quantized q,k,v,softmax(qk^T), attn output during prefill + "VLLM_USE_ROCM_FP8_FLASH_ATTN": + lambda: (os.getenv("VLLM_USE_ROCM_FP8_FLASH_ATTN", "False").lower() in + ("true", "1")), + # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. @@ -751,6 +762,10 @@ def get_vllm_port() -> Optional[int]: lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in ("true", "1")), + "VLLM_ROCM_USE_AITER_CK_TILE_LINEAR": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_CK_TILE_LINEAR", "True").lower() in + ("true", "1")), + # Whether to use aiter moe ops. # By default is enabled. "VLLM_ROCM_USE_AITER_MOE": @@ -774,6 +789,24 @@ def get_vllm_port() -> Optional[int]: lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")), + # Whether to use aiter silu mul. + # By default is disabled. + "VLLM_USE_AITER_TRITON_SILU_MUL": + lambda: (os.getenv("VLLM_USE_AITER_TRITON_SILU_MUL", "False").lower() in + ("true", "1")), + + # Whether to use aiter fp4 gemm asm. + # By default is disabled. + "VLLM_TRITON_FP4_GEMM_USE_ASM": + lambda: (os.getenv("VLLM_TRITON_FP4_GEMM_USE_ASM", "False").lower() in + ("true", "1")), + + # Whether to use aiter rope. + # By default is disabled. 
+ "VLLM_USE_AITER_TRITON_ROPE": + lambda: (os.getenv("VLLM_USE_AITER_TRITON_ROPE", "False").lower() in + ("true", "1")), + # use rocm skinny gemms "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in @@ -1192,6 +1225,10 @@ def get_vllm_port() -> Optional[int]: "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + # Use AITER Triton fused rope + zeros + reshape_and_cache + "VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE": + lambda: bool(int(os.getenv("VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE", "0"))), + } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index f3248589abc4..eb7697a5d383 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -8,6 +8,7 @@ import torch.nn as nn import torch.nn.functional as F +from vllm import envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.custom_op import CustomOp @@ -65,7 +66,12 @@ class SiluAndMul(CustomOp): def __init__(self): super().__init__() - if current_platform.is_cuda_alike(): + + if current_platform.is_rocm() and envs.VLLM_USE_AITER_TRITON_SILU_MUL: + import aiter.ops.triton.activation as ops + self.op = lambda x, shuffle: \ + ops.act_mul_and_mxfp4_quant(x, "silu", shuffle=shuffle) + elif current_platform.is_cuda_alike(): self.op = torch.ops._C.silu_and_mul elif current_platform.is_xpu(): from vllm._ipex_ops import ipex_ops @@ -78,12 +84,19 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] - def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - self.op(out, x) - return out + def forward_cuda(self, + x: torch.Tensor, + scale: Optional[torch.Tensor] = None) -> torch.Tensor: + if envs.VLLM_USE_AITER_TRITON_SILU_MUL: + shuffle = envs.VLLM_TRITON_FP4_GEMM_USE_ASM and x.shape[0] >= 32 + out, out_scales = self.op(x, shuffle) + return out, out_scales + else: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 17a5c735a57f..3c94f7ef7983 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -44,6 +44,11 @@ from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled +try: + from aiter.ops.triton.moe_op_mxfp4 import _fused_moe_kernel_mxfp4 +except ImportError: + _fused_moe_kernel_mxfp4 = None + logger = init_logger(__name__) @@ -507,6 +512,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a8: bool, use_int8_w8a16: bool, use_int4_w4a16: bool, + use_mxfp4_w4a4: bool, per_channel_quant: bool, block_shape: Optional[list[int]] = None, B_bias: Optional[torch.Tensor] = None) -> None: @@ -524,6 +530,9 @@ def invoke_fused_moe_kernel(A: torch.Tensor, elif use_int8_w8a16 or use_int4_w4a16: assert B_scale is not None assert block_shape is None or block_shape[0] == 0 + elif use_mxfp4_w4a4: + assert A_scale is not None + assert B_scale is not None else: assert A_scale is None assert 
B_scale is None @@ -611,6 +620,55 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a16=use_int8_w8a16, **config, ) + elif use_mxfp4_w4a4: + ONE = torch.ones(B.size(0), dtype=torch.float32, device=A.device) + # overwrite config with a static one for now + config = { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1, + } + _fused_moe_kernel_mxfp4[grid]( + A, + B, + C, + ONE[0], + ONE, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0), + A_scale.stride(1), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + SWIZZLE_MX_A=False, + SWIZZLE_MX_B=False, + **config, + ) else: config = config.copy() BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") @@ -1601,7 +1659,7 @@ def fused_experts_impl( else: out_hidden_states = torch.empty_like(hidden_states) - if use_mxfp4_w4a4: + if use_mxfp4_w4a4 and not current_platform.supports_mx(): # Weight has to be dequantized for mxfp4 emulation. w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype) w1_scale = None @@ -1660,6 +1718,8 @@ def fused_experts_impl( use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4 + and current_platform.supports_mx(), per_channel_quant=per_channel_quant, block_shape=block_shape, B_bias=w1_bias) @@ -1710,6 +1770,8 @@ def fused_experts_impl( use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4 + and current_platform.supports_mx(), per_channel_quant=per_channel_quant, block_shape=block_shape, B_bias=w2_bias) @@ -2016,6 +2078,8 @@ def apply( use_int8_w8a8=self.use_int8_w8a8, use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, + use_mxfp4_w4a4=self.use_mxfp4_w4a4 + and current_platform.supports_mx(), per_channel_quant=self.per_act_token_quant, block_shape=self.block_shape, B_bias=None # TODO support B_bias @@ -2049,6 +2113,8 @@ def apply( use_int8_w8a8=self.use_int8_w8a8, use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, + use_mxfp4_w4a4=self.use_mxfp4_w4a4 + and current_platform.supports_mx(), per_channel_quant=self.per_act_token_quant, block_shape=self.block_shape, B_bias=None # TODO support B_bias diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 93e20c3477bb..b838fd798bbc 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -279,7 +279,7 @@ def rocm_aiter_grouped_topk( if e_score_correction_bias is not None: torch.ops.vllm.rocm_aiter_biased_grouped_topk( gating_output, - e_score_correction_bias, + e_score_correction_bias.to(gating_output.dtype), topk_weights, topk_ids, num_expert_group, @@ -409,15 +409,15 @@ def shuffle_weights( *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) ) -> tuple[torch.Tensor, ...]: """ - Applies shuffle_weight function from AITER to each + Applies shuffle_weight function from AITER to each input tensor and returns them. 
- + Rearranges (shuffles) the input tensor/s into a specified block layout for optimized computation. Args: *tensors: Variable number of torch.Tensor objects. - layout: A pair of integers specifying the + layout: A pair of integers specifying the block sizes used to divide the tensors during shuffling. Default is (16, 16). diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 1aeb3f92bc3e..56b24d2691bd 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -19,6 +19,9 @@ from vllm.utils import cdiv from vllm.utils.flashinfer import fp4_quantize +if current_platform.supports_mx(): + from aiter.ops.triton.quant import dynamic_mxfp4_quant + @triton.jit def _count_expert_num_tokens(topk_ids_ptr, expert_num_tokens_ptr, num_experts, @@ -169,14 +172,14 @@ def _mxfp4_quantize( A_scale: Optional[torch.Tensor], per_act_token_quant: bool, block_shape: Optional[list[int]] = None, -) -> tuple[torch.Tensor, None]: +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert block_shape is None if not current_platform.supports_mx(): A = quant_dequant_mxfp4(A) - else: - raise NotImplementedError() - - return A, None + return A, A_scale + if A_scale is not None: + return A, A_scale + return dynamic_mxfp4_quant(A) def _mxfp8_quantize( diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index a9577abedb49..b2d753018b4e 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op def is_rocm_aiter_rmsnorm_enabled() -> bool: @@ -47,45 +48,69 @@ def fused_add_rms_norm( return out, residual_out -def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> torch.Tensor: - import aiter as rocm_aiter - if x.dim() > 2: - x_original_shape = x.shape - x = x.reshape(-1, x_original_shape[-1]) - x = rocm_aiter.rms_norm(x, weight, variance_epsilon) - return x.reshape(x_original_shape) +if is_rocm_aiter_rmsnorm_enabled(): - return rocm_aiter.rms_norm(x, weight, variance_epsilon) + def rocm_aiter_rms_norm_impl(x: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + from aiter.ops.triton.rmsnorm import rms_norm + if x.dim() > 2: + x_original_shape = x.shape + x = x.reshape(-1, x_original_shape[-1]) + x = rms_norm(x, weight, variance_epsilon) + return x.reshape(x_original_shape) + return rms_norm(x, weight, variance_epsilon) -def rocm_aiter_fused_add_rms_norm( - x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + def rocm_aiter_rms_norm_fake(input: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + return input.clone() - import aiter as rocm_aiter + direct_register_custom_op( + op_name="rocm_aiter_rms_norm", + op_func=rocm_aiter_rms_norm_impl, + mutates_args=[], + fake_impl=rocm_aiter_rms_norm_fake, + dispatch_key=current_platform.dispatch_key, + ) - residual_out = torch.empty_like(residual) - output = torch.empty_like(x) - rocm_aiter.rmsnorm2d_fwd_with_add( - output, # output - x, # input - residual, # residual input - residual_out, # residual output - weight, - variance_epsilon, + def rocm_aiter_fused_add_rms_norm_impl( + x: torch.Tensor, residual: torch.Tensor, weight: 
torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.triton.rmsnorm import rmsnorm2d_fwd_with_add + residual_out = torch.empty_like(residual) + output = torch.empty_like(x) + rmsnorm2d_fwd_with_add( + output, # output + x, # input + residual, # residual input + residual_out, # residual output + weight, + variance_epsilon, + ) + return output, residual_out + + def rocm_aiter_fused_add_rms_norm_fake( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + return x.clone(), residual.clone() + + direct_register_custom_op( + op_name="rocm_aiter_fused_add_rms_norm", + op_func=rocm_aiter_fused_add_rms_norm_impl, + mutates_args=[], + fake_impl=rocm_aiter_fused_add_rms_norm_fake, + dispatch_key=current_platform.dispatch_key, ) - return output, residual_out def dispatch_cuda_rmsnorm_func(add_residual: bool): if add_residual: if is_rocm_aiter_rmsnorm_enabled(): - return rocm_aiter_fused_add_rms_norm + return torch.ops.vllm.rocm_aiter_fused_add_rms_norm return fused_add_rms_norm if is_rocm_aiter_rmsnorm_enabled(): - return rocm_aiter_rms_norm + return torch.ops.vllm.rocm_aiter_rms_norm return rms_norm diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 19ff63145024..313d8e5460e2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1351,7 +1351,9 @@ def weight_loader_v2(self, param: BasevLLMParameter, param.load_row_parallel_weight(loaded_weight=loaded_weight) def forward( - self, input_ + self, + input_, + input_scales=None ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: if self.input_is_parallel: input_parallel = input_ @@ -1366,9 +1368,16 @@ def forward( # Only fuse bias add into GEMM for rank 0 (this ensures that # bias will not get added more than once in TP>1 case) bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias - output_parallel = self.quant_method.apply(self, - input_parallel, - bias=bias_) + if input_scales is not None: + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_, + x_scales=input_scales) + else: + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + if self.reduce_results and self.tp_size > 1: output = tensor_model_parallel_all_reduce(output_parallel) else: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0200b0e9ed00..ff273e185f00 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -209,8 +209,11 @@ def __init__(self, quant_config: Fp8Config): # and at the moment are MI300 series self.use_aiter_and_is_supported = (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER - and envs.VLLM_ROCM_USE_AITER_LINEAR - and current_platform.is_fp8_fnuz()) + and envs.VLLM_ROCM_USE_AITER_LINEAR) + self.use_ck_tile_and_is_supported = ( + current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER + and envs.VLLM_ROCM_USE_AITER_CK_TILE_LINEAR + and current_platform.is_fp8_fnuz()) self.block_quant = self.quant_config.weight_block_size is not None self.act_q_static = self.quant_config.activation_scheme == "static" @@ -365,7 +368,10 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(weight, requires_grad=False) layer.weight_scale_inv = Parameter(weight_scale_inv, requires_grad=False) - + if self.use_ck_tile_and_is_supported: + 
weight_tmp = layer.weight.clone() + del layer.weight + layer.weight = weight_tmp # If checkpoint not serialized fp8, quantize the weights. elif not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, @@ -381,9 +387,13 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp8, handle that there are N scales for N # shards in a fused module else: + layer.weight_scale.data[layer.weight_scale.data == torch.finfo( + torch.float32).min] = 1 layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, requires_grad=False) if self.quant_config.activation_scheme == "static": + layer.input_scale.data[layer.input_scale.data == torch.finfo( + torch.float32).min] = 1 layer.input_scale = torch.nn.Parameter(layer.input_scale.data, requires_grad=False) @@ -463,6 +473,7 @@ def apply(self, bias=bias, cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, use_aiter_and_is_supported=self.use_aiter_and_is_supported, + use_ck_tile_and_is_supported=self.use_ck_tile_and_is_supported, ) return self.fp8_linear.apply(input=x, @@ -553,6 +564,113 @@ def maybe_make_prepare_finalize( logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize + def _maybe_pad_rocm_aiter_block_scaled_fused_moe_weights( + self, + w2_weight, + w2_weight_scale_inv, + w13_weight, + w13_weight_scale_inv, + block_k=128, + block_n=128): + """ + Pads the MoE weights and scales to align with block quantization + requirements. + + aiter.fmoe_fp8_blockscale_g1u1 only supports out dtype = bf16, + requires inter_dim % 256 == 0, and expects fc_scale_blkn and + fc_scale_blkk to be 128. + """ + + if (not self.rocm_aiter_moe_enabled): + return (w2_weight, w2_weight_scale_inv, w13_weight, + w13_weight_scale_inv) + + if (self.rocm_aiter_moe_enabled + and (w2_weight.shape[-1] % 256 == 0 + and w13_weight.shape[-2] % 256 == 0)): + return (w2_weight, w2_weight_scale_inv, w13_weight, + w13_weight_scale_inv) + + logger.info_once( + "ROCm AITER Padding MoE weights and scales for block quantization."
+ ) + # for now this is enabled for DeepSeekV3 and Qwen3 + assert block_k == 128, "block_k must be 128" + assert block_n == 128, "block_n must be 128" + assert block_k == block_n, ( + "block_k and block_n must be the same value: 128") + + num_experts, hidden_size, inter_dim = w2_weight.shape + padded_inter_dim = ((inter_dim + 255) // 256) * 256 + # inter_dim_block_scale = layer.w2_weight_scale_inv.shape[2] + # = ((intermediate_size_per_partition + block_n - 1) // block_n) + inter_dim_block_scale = (inter_dim + block_n - 1) // block_n + padded_inter_dim_block_scale = ((padded_inter_dim + block_n - 1) // + block_n) + + # k_block_scale is also known as hidden_size_block + # Pad w2_weight to + # [num_experts, hidden_size, inter_dim] + # Padding Logic: + # [expert(local_expert:EP), hidden_size, inter_dim] + # after padding inter_dim with 0.0 to multiple of 256 + # [expert(local_expert:EP), hidden_size, padded_inter_dim] + if padded_inter_dim > inter_dim: + pad_size = padded_inter_dim - inter_dim + w2_weight = F.pad(w2_weight, (0, pad_size), value=0.0) + + # Pad w2_weight_scale_inv to + # [num_experts, k_block_scale, inter_dim_block_scale] + # Padding Logic: + # [expert(local_expert:EP), k_block_scale, inter_dim_block_scale] + # after padding inter_dim with 1.0 + # [expert(local_expert:EP), k_block_scale, padded_inter_dim_block_scale] # noqa: E501 + if padded_inter_dim_block_scale > inter_dim_block_scale: + pad_size = padded_inter_dim_block_scale - inter_dim_block_scale + w2_weight_scale_inv = F.pad(w2_weight_scale_inv, (0, pad_size), + value=1.0) + + # Pad w13_weight to + # [num_experts, 2 * inter_dim, hidden_size] + # Padding Logic: + # [expert(local_expert:EP), inter_dim*2, dim] + # after reshape + # [expert(local_expert:EP), 2, inter_dim, dim] + # after right padding + # [expert(local_expert:EP), 2, padded_inter_dim, dim] + # after reshape + # [expert(local_expert:EP), 2 * padded_inter_dim, dim] + w13_weight = w13_weight.view(num_experts, 2, inter_dim, hidden_size) + if padded_inter_dim > inter_dim: + pad_size = padded_inter_dim - inter_dim + w13_weight = F.pad(w13_weight, (0, 0, 0, pad_size), value=0.0) + w13_weight = w13_weight.view(num_experts, 2 * padded_inter_dim, + hidden_size) + + # Pad w13_weight_scale_inv to + # [num_experts, 2 * inter_dim_block_scale, k_block_scale] + # Padding Logic: + # k_block_scale = ((hidden_size + block_k - 1) // block_k) + # [expert(local_expert:EP), inter_dim_block_scale*2, k_block_scale] # noqa: E501 + # after reshape + # [expert(local_expert:EP), 2, inter_dim_block_scale, k_block_scale] # noqa: E501 + # after right padding with 1.0 + # [expert(local_expert:EP), 2, padded_inter_dim_block_scale, k_block_scale] # noqa: E501 + # after reshape + # [expert(local_expert:EP), 2 * padded_inter_dim_block_scale, k_block_scale] # noqa: E501 + k_block_scale = w13_weight_scale_inv.shape[ + 2] # k_block_scale = (hidden_size + block_k - 1) // block_k + w13_weight_scale_inv = w13_weight_scale_inv.view( + num_experts, 2, inter_dim_block_scale, k_block_scale) + if padded_inter_dim_block_scale > inter_dim_block_scale: + pad_size = padded_inter_dim_block_scale - inter_dim_block_scale + w13_weight_scale_inv = F.pad(w13_weight_scale_inv, + (0, 0, 0, pad_size), + value=1.0) + w13_weight_scale_inv = w13_weight_scale_inv.view( + num_experts, 2 * padded_inter_dim_block_scale, k_block_scale) + + return w2_weight, w2_weight_scale_inv, w13_weight, w13_weight_scale_inv + def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, 
params_dtype: torch.dtype, **extra_weight_attrs): @@ -714,6 +832,17 @@ def process_weights_after_loading(self, layer: Module) -> None: w2_weight = layer.w2_weight w2_weight_scale_inv = layer.w2_weight_scale_inv + if self.quant_config.weight_block_size is not None: + (w2_weight, w2_weight_scale_inv, w13_weight, + w13_weight_scale_inv + ) = self._maybe_pad_rocm_aiter_block_scaled_fused_moe_weights( + w2_weight, + w2_weight_scale_inv, + w13_weight, + w13_weight_scale_inv, + block_n=self.quant_config.weight_block_size[0], + block_k=self.quant_config.weight_block_size[1]) + # torch.compile() cannot use Parameter subclasses. layer.w13_weight = Parameter(w13_weight, requires_grad=False) layer.w13_weight_scale_inv = Parameter(w13_weight_scale_inv, diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e5604670fb4c..044fd77347e3 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -52,7 +52,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # regardless whether the kv-scale is available in the checkpoint. # No need to process kv scales after loading if we are going to # calculate them on the fly. - if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales: + if not layer.calculate_kv_scales: if layer.k_scale > 0.0 and layer.v_scale > 0.0: # We prefer to use separate k_scale and v_scale if present k_scale = layer.k_scale.to("cpu").tolist() @@ -124,6 +124,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # These are used in the final Attention.forward() layer._q_scale.copy_(q_scale) + layer._q_scale_float = q_scale + layer._prob_scale.copy_(prob_scale) if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0): diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 6724796904f0..1cb373b6f832 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -183,7 +183,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size = round_up(hidden_size, 256) elif current_platform.is_rocm(): intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, 128) + intermediate_size_per_partition, 256) else: intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 64) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index b67ee5cf453d..56224cbbcdf0 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -163,6 +163,19 @@ def _check_scheme_supported(self, else: return False + def is_fp8_w8a8(self) -> bool: + # Returns True if all quantized layers in model are fp8 w8a8 + global_quant_config = cast( + dict[str, Any], self.quant_config.get("global_quant_config")) + layer_quant_configs = cast(dict[str, Any], + self.quant_config.get("layer_quant_config")) + for config in (global_quant_config, *layer_quant_configs.values()): + weight_config = cast(dict[str, Any], config.get("weight")) + input_config = cast(dict[str, Any], config.get("input_tensors")) + if not self._is_fp8_w8a8(weight_config, input_config): + return False + return True + def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]], input_quant: Optional[dict[str, Any]]) -> bool: # 
Confirm weights and input quantized. @@ -385,7 +398,8 @@ def create_weights(self, layer: torch.nn.Module, def apply(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor] = None): + bias: Optional[torch.Tensor] = None, + x_scales: torch.Tensor = None): """ Use the output of create_weights and the CompressedTensorsScheme associated with the layer to apply the forward pass with the @@ -395,7 +409,11 @@ def apply(self, scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") - return scheme.apply_weights(layer, x, bias=bias) + + if x_scales is None: + return scheme.apply_weights(layer, x, bias=bias) + else: + return scheme.apply_weights(layer, x, bias=bias, x_scales=x_scales) class QuarkKVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 58f56c6381b3..e4771056cc4e 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -301,13 +301,7 @@ def __init__( "QDQ (quantize and dequantize) will be used, with the linear " "layers computed in high precision.") else: - self.emulate = True - logger.warning_once( - "The current platform supports native MXFP4 " - "computation, but kernels are not yet integrated in vLLM. " - "Simulated weight dequantization and activation " - "QDQ (quantize and dequantize) will be used, with the linear " - "layers computed in high precision.") + self.emulate = False def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py index 880438a22a69..18a2495297b6 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py @@ -6,7 +6,7 @@ import torch import torch.nn.functional as F -from vllm.logger import init_logger +from vllm import envs from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( OCP_MX_BLOCK_SIZE, dequant_mxfp4, quant_dequant_mxfp4) @@ -14,7 +14,80 @@ PackedvLLMParameter) from vllm.platforms import current_platform -logger = init_logger(__name__) +try: + from aiter.ops.shuffle import shuffle_weight + from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4 + from aiter.ops.triton.quant import dynamic_mxfp4_quant + + from vllm.utils import direct_register_custom_op + if envs.VLLM_TRITON_FP4_GEMM_USE_ASM: + from aiter import gemm_a4w4, per_1x32_f4_quant_hip + + def gemm_with_dynamic_quant( + x: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + x_scales: torch.Tensor = None, + out_dtype: Optional[torch.dtype] = torch.bfloat16, + ) -> torch.Tensor: + M = x.shape[0] + if envs.VLLM_TRITON_FP4_GEMM_USE_ASM: + if x_scales is None: + # use hip quant kernel for performance + x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True) + else: + x_q = x + x_s = x_scales + + # 32 alignment is enough for dim0 padding of output for + # gemm_a4w4 kernel + y = torch.empty((M + 31) // 32 * 32, + weight.shape[0], + device=x_q.device, + dtype=out_dtype) + + gemm_a4w4(x_q, + weight, + x_s, + weight_scale.view(x_s.dtype), + y, + bpreshuffle=True) + return y[:M] + else: + if x_scales is 
None: + x_q, x_s = dynamic_mxfp4_quant(x) + else: + x_q = x + x_s = x_scales + y = torch.empty(x_q.shape[0], + weight.shape[0], + device=x_q.device, + dtype=out_dtype) + + gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y) + return y + + def gemm_with_dynamic_quant_fake( + x: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + x_scales: torch.Tensor = None, + out_dtype: Optional[torch.dtype] = torch.bfloat16, + ) -> torch.Tensor: + return torch.empty((*x.shape[:-1], weight.shape[0]), + dtype=out_dtype, + device=x.device) + + direct_register_custom_op( + op_name="gemm_with_dynamic_quant", + op_func=gemm_with_dynamic_quant, + mutates_args=[], + fake_impl=gemm_with_dynamic_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + +except ImportError: + dynamic_mxfp4_quant = gemm_afp4wfp4 = None __all__ = ["QuarkW4A4MXFP4"] @@ -27,29 +100,14 @@ def __init__(self, weight_quant_spec: dict[str, Any], self.qscheme = "per_group" self.weight_quant_spec = weight_quant_spec self.input_quant_spec = input_quant_spec - - self.static_input_scales = not input_quant_spec.get("is_dynamic") - - if self.static_input_scales: + self.emulate = not current_platform.supports_mx() + if not self.emulate and (dynamic_mxfp4_quant is None + or gemm_afp4wfp4 is None): + # Currently need these kernels if not emulating raise NotImplementedError( - "QuarkW4A4MXFP4 with static input scales is currently not " - "implemented. Please open an issue.") - - if not current_platform.supports_mx(): - self.emulate = True - logger.warning_once( - "The current platform does not support native MXFP4 " - "computation. Simulated weight dequantization and activation " - "QDQ (quantize and dequantize) will be used, with the linear " - "layers computed in high precision.") - else: - self.emulate = True - logger.warning_once( - "The current platform supports native MXFP4 " - "computation, but kernels are not yet integrated in vLLM. " - "Simulated weight dequantization and activation " - "QDQ (quantize and dequantize) will be used, with the linear " - "layers computed in high precision.") + f"{self.__class__.__name__} requires AITER to be installed " + "for non-emulation mode! Please refer to " + "https://github.com/ROCm/aiter for installation details.") @classmethod def get_min_capability(cls) -> int: @@ -58,8 +116,65 @@ def get_min_capability(cls) -> int: def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) - layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, - requires_grad=False) + + if self.emulate: + layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, + requires_grad=False) + try: + from quark.torch.export.nn.modules import realquantizer + from quark.torch.quantization.config.config import ( + QuantizationSpec) + except ImportError as err: + raise ImportError( + "The package `amd-quark` is required to use AMD Quark " + "MX-FP4 models. 
Please install it with `pip install " + "amd-quark`.") from err + + weight_quant_spec = QuantizationSpec.from_dict( + self.weight_quant_spec) + + weight_quantizer = realquantizer.get_real_quantizer( + qspec=weight_quant_spec, + quantizer=None, + real_quantized=True, + reorder=False, + float_dtype=self.out_dtype, + scale_shape=layer.weight_scale.shape, + zero_point_shape=None, + ) + weight_quantizer.scale.data = layer.weight_scale.data + + layer.weight = torch.nn.Parameter( + weight_quantizer(layer.weight.data).to(self.out_dtype), + requires_grad=False, + ) + layer.weight_scale = None + + # This call is necessary to release the scales memory. + torch.cuda.empty_cache() + else: + if envs.VLLM_TRITON_FP4_GEMM_USE_ASM: + # shuffle weight scale + weight_scale_shuffle = layer.weight_scale.data + sm, sn = weight_scale_shuffle.shape + weight_scale_shuffle = weight_scale_shuffle.view( + sm // 32, 2, 16, sn // 8, 2, 4, 1) + weight_scale_shuffle = weight_scale_shuffle.permute( + 0, 3, 5, 2, 4, 1, 6).contiguous() + weight_scale_shuffle = weight_scale_shuffle.view(sm, sn) + layer.weight_scale = torch.nn.Parameter(weight_scale_shuffle, + requires_grad=False) + + # shuffle weight + weight_shuffle = layer.weight.data + weight_shuffle = shuffle_weight(weight_shuffle, + layout=(16, 16)) + layer.weight = torch.nn.Parameter(weight_shuffle, + requires_grad=False) + else: + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data.T.contiguous(), + requires_grad=False) def create_weights(self, layer: torch.nn.Module, output_partition_sizes: list[int], @@ -100,7 +215,8 @@ def create_weights(self, layer: torch.nn.Module, def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + x_scales: torch.Tensor = None) -> torch.Tensor: if self.emulate: dq_w = dequant_mxfp4(layer.weight, layer.weight_scale, x.dtype) @@ -109,4 +225,5 @@ def apply_weights(self, return F.linear(x, dq_w, bias) else: - raise NotImplementedError() + return torch.ops.vllm.gemm_with_dynamic_quant( + x, layer.weight, layer.weight_scale, x_scales, self.out_dtype) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7b324dce3c36..242eee4ad7ba 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -55,9 +55,12 @@ def rocm_aiter_gemm_w8a8_blockscale_impl( block_size: list[int], output_dtype: torch.dtype = torch.float16, ) -> torch.Tensor: - import aiter as rocm_aiter + # import aiter as rocm_aiter + + # return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) + from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale - return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) + return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) def rocm_aiter_gemm_w8a8_blockscale_fake( @@ -92,8 +95,58 @@ def rocm_aiter_gemm_w8a8_blockscale_fake( aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128) +def rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + import aiter as rocm_aiter + + return rocm_aiter.gemm_a8w8_blockscale_ck_tile(A, + B, + As, + Bs, + dtype=output_dtype) + + +def rocm_aiter_ck_tile_gemm_w8a8_blockscale_fake( + A: torch.Tensor, + B: 
torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + + m = A.shape[0] + n = B.shape[0] + Y = torch.empty(m, n, dtype=output_dtype, device=A.device) + return Y + + +if current_platform.is_rocm(): + direct_register_custom_op( + op_name="rocm_aiter_ck_tile_gemm_w8a8_blockscale", + op_func=rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl, + mutates_args=[], + fake_impl=rocm_aiter_ck_tile_gemm_w8a8_blockscale_fake, + dispatch_key=current_platform.dispatch_key, + ) + if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_CK_TILE_LINEAR + and current_platform.is_fp8_fnuz()): + + import aiter as rocm_aiter + from aiter import get_hip_quant + + aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128) + + def dispatch_w8a8_blockscale_func( - use_cutlass: bool, use_aiter_and_is_supported: bool + use_cutlass: bool, use_aiter_and_is_supported: bool, + use_ck_tile_and_is_supported: bool ) -> Callable[[ torch.Tensor, torch.Tensor, @@ -106,6 +159,8 @@ def dispatch_w8a8_blockscale_func( return cutlass_scaled_mm if (use_aiter_and_is_supported): return torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale + if (use_ck_tile_and_is_supported): + return torch.ops.vllm.rocm_aiter_ck_tile_gemm_w8a8_blockscale return w8a8_block_fp8_matmul @@ -120,6 +175,7 @@ def apply_w8a8_block_fp8_linear( bias: Optional[torch.Tensor] = None, cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, use_aiter_and_is_supported: bool = False, + use_ck_tile_and_is_supported: bool = False, ) -> torch.Tensor: assert input_scale is None # View input as 2D matrix for fp8 methods @@ -167,7 +223,7 @@ def apply_w8a8_block_fp8_linear( use_cutlass = False w8a8_blockscale_func = dispatch_w8a8_blockscale_func( - use_cutlass, use_aiter_and_is_supported) + use_cutlass, use_aiter_and_is_supported, use_ck_tile_and_is_supported) if use_cutlass: q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) @@ -175,7 +231,7 @@ def apply_w8a8_block_fp8_linear( block_size, input.dtype) else: - if use_aiter_and_is_supported: + if use_aiter_and_is_supported and current_platform.is_fp8_fnuz(): q_input, x_scale = aiter_per1x128_quant( input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8) else: @@ -199,6 +255,7 @@ def apply_w8a8_block_fp8_linear_fake( bias: Optional[torch.Tensor] = None, cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, use_aiter_and_is_supported: bool = False, + use_ck_tile_and_is_supported: bool = False, ) -> torch.Tensor: output_shape = [*input.shape[:-1], weight.shape[0]] return torch.empty(output_shape, dtype=input.dtype, device=input.device) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 3de928fea720..fec7a6eff73c 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -30,12 +30,37 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): "cause performance degradation. 
Please upgrade to torch nightly") value_layout, value_layout_opts = StridedLayout, dict() scale_layout, scale_layout_opts = StridedLayout, dict() + + elif current_platform.is_rocm(): + from triton_kernels.target_info import is_hip + from triton_kernels.tensor_details.layout import ( + BlackwellMXScaleLayout, GFX950MXScaleLayout, HopperMXScaleLayout, + HopperMXValueLayout) + value_layout = StridedLayout + scale_layout = StridedLayout + if not is_hip(): + if torch.cuda.get_device_capability()[0] == 9: + value_layout = HopperMXValueLayout + scale_layout = HopperMXScaleLayout + if torch.cuda.get_device_capability()[0] == 10: + scale_layout = BlackwellMXScaleLayout + else: + import os + use_scale_preshuffling = os.environ.get( + "TRITON_HIP_PRESHUFFLE_SCALES", "0") == "1" + if use_scale_preshuffling: + scale_layout = GFX950MXScaleLayout else: + """ weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel + """ value_layout, value_layout_opts = \ - layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + layout.make_default_matmul_mxfp4_w_layout( + mx_axis=1) scale_layout, scale_layout_opts = ( layout.make_default_matmul_mxfp4_w_scale_layout( mx_axis=1, num_warps=num_warps)) + + if current_platform.is_cuda() and \ current_platform.is_device_capability(100): constraints = { @@ -46,10 +71,16 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): # transpose the tensor so that the quantization axis is on dim1 quant_tensor = quant_tensor.transpose(-2, -1) scale = scale.transpose(-2, -1) - quant_tensor = convert_layout(wrap_torch_tensor(quant_tensor, dtype=FP4), - value_layout, **value_layout_opts) - scale = convert_layout(wrap_torch_tensor(scale), scale_layout, - **scale_layout_opts) + if current_platform.is_rocm(): + quant_tensor = convert_layout( + wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout) + scale = convert_layout(wrap_torch_tensor(scale), scale_layout) + else: + quant_tensor = convert_layout( + wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout, + **value_layout_opts) + scale = convert_layout(wrap_torch_tensor(scale), scale_layout, + **scale_layout_opts) return quant_tensor, InFlexData(), scale diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 10fce857a8ae..63fd796c17c1 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -5,9 +5,12 @@ import torch +import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch +from .rocm_aiter_rope_ops import (is_rocm_rotary_embedding_enabled, + is_rocm_triton_rotary_embedding_enabled) @CustomOp.register("rotary_embedding") @@ -35,6 +38,11 @@ def __init__( cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) + self.is_rocm_aiter_enabled = \ + is_rocm_rotary_embedding_enabled() + self.is_rocm_aiter_triton_enabled = \ + is_rocm_triton_rotary_embedding_enabled( + ) def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" @@ -98,7 +106,6 @@ def forward_cuda( key: Optional[torch.Tensor] = None, offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - from vllm import _custom_ops as ops # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) # is expensive, so avoid calling it if possible @@ -107,18 +114,121 @@ def forward_cuda( self.cos_sin_cache = 
self.cos_sin_cache.to(query.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, - self.is_neox_style, self.rotary_dim, - offsets) + num_tokens = positions.numel() + + if envs.VLLM_USE_AITER_TRITON_ROPE: + assert key is not None + cos, sin = self.cos_sin_cache.chunk(2, dim=-1) + query_shape = query.shape + key_shape = key.shape + query = query.view(num_tokens, -1, self.head_size) + key = key.view(num_tokens, -1, self.head_size) + query_ = query[..., :self.rotary_dim] + key_ = key[..., :self.rotary_dim] + rotate_style = 0 if self.is_neox_style else 1 + positions = positions.view(*query.shape[:1]) + if offsets is not None: + offsets = offsets.view(*query.shape[:1]) + torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_triton( + positions, + sin, + cos, + query_, + key_, + offsets, + rotate_style, + False, + ) + query = query.view(query_shape) + key = key.view(key_shape) else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + from vllm import _custom_ops as ops + + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + self.rotary_dim, offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + def forward_hip( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first=False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + # currently only rotary embedding ops from AITER package are + # supported for HiP forward. 
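The branches that follow implement the precedence summarized in this minimal sketch; the helper name select_rope_path is illustrative only (not part of the patch) and mirrors is_rocm_rotary_embedding_enabled() and is_rocm_triton_rotary_embedding_enabled() from rocm_aiter_rope_ops.py:

def select_rope_path(is_rocm: bool, use_aiter: bool, use_triton_rope: bool) -> str:
    # Mirrors RotaryEmbedding.forward_hip(): the AITER Triton fused-rope path
    # wins, then the AITER HIP ops, otherwise the native PyTorch fallback.
    if is_rocm and use_aiter and use_triton_rope:
        return "forward_cuda (AITER Triton rope)"
    if is_rocm and use_aiter:
        return "forward_hip_rocm_aiter (AITER HIP rope)"
    return "forward_native"

assert select_rope_path(True, True, True).startswith("forward_cuda")
assert select_rope_path(True, True, False).startswith("forward_hip_rocm_aiter")
assert select_rope_path(True, False, False) == "forward_native"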
+ if self.is_rocm_aiter_triton_enabled: + return self.forward_cuda(positions, query, key, offsets) + elif self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets, + is_nope_first) + return self.forward_native(positions, query, key, offsets) + + def forward_hip_rocm_aiter( + self, + positions: torch.Tensor, + # if is_nope_first + # [[batch_size, seq_len, num_heads, nope_size+rope_size] + # if NOT is_nope_first + # [[batch_size, seq_len, num_heads, rope_size+nope_size], + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + cos, sin = self.cos_sin_cache.chunk(2, dim=-1) + + cos = cos.unsqueeze(-2).unsqueeze(-2) + sin = sin.unsqueeze(-2).unsqueeze(-2) + + rotate_style = 0 if self.is_neox_style else 1 + + num_tokens = positions.numel() + + query_shape = query.shape + query = query.view(1, num_tokens, -1, self.head_size) + if key is not None: + key_shape = key.shape + key = key.view(1, num_tokens, -1, self.head_size) + + positions = positions.view(*query.shape[:2]) + if offsets is not None: + offsets = offsets.view(*query.shape[:2]) + + if not is_nope_first: + query_ = query[..., :self.rotary_dim] + key_ = key[..., :self.rotary_dim] if key is not None else None + else: + query_ = query[..., -self.rotary_dim:] + key_ = key[..., -self.rotary_dim:] if key is not None else None + + if key_ is None: + torch.ops.vllm.rocm_aiter_rotary_emb_without_key_forward_hip( + positions, sin, cos, query_, offsets, rotate_style, + is_nope_first) + return query.view(query_shape), None + + torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_hip( + positions, sin, cos, query_, key_, offsets, rotate_style, + is_nope_first) + + return query.view(query_shape), key.view(key_shape) + def forward_xpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index cd888b733426..d72daa18db02 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -96,6 +96,11 @@ def forward( offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" + if self.is_rocm_aiter_triton_enabled: + return self.forward_cuda(positions, query, key, offsets) + elif self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets) + assert key is not None query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py new file mode 100644 index 000000000000..8f0b2e1a371c --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py @@ -0,0 +1,192 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +import vllm.envs as envs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op + + +def is_rocm_rotary_embedding_enabled() -> bool: + 
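"""Gate for registering the AITER RoPE custom ops: True only on ROCm with VLLM_ROCM_USE_AITER enabled.""" +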
return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER) + + +def is_rocm_triton_rotary_embedding_enabled() -> bool: + return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER + and envs.VLLM_USE_AITER_TRITON_ROPE) + + +def rocm_aiter_rotary_emb_without_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_fwd_inplace( + query, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_fwd_inplace( + query, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +def rocm_aiter_rotary_emb_without_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +if is_rocm_rotary_embedding_enabled(): + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_with_key_forward_hip", + op_func=rocm_aiter_rotary_emb_with_key_forward_hip_impl, + mutates_args=["key", "query"], + fake_impl=rocm_aiter_rotary_emb_with_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_without_key_forward_hip", + op_func=rocm_aiter_rotary_emb_without_key_forward_hip_impl, + mutates_args=["query"], + fake_impl=rocm_aiter_rotary_emb_without_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_triton_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter.ops.triton.rope as ops + if offsets is None: + ops.rope_cached_thd_positions_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_thd_positions_offsets_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_triton_fake( + positions: torch.Tensor, + sin: 
torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +if is_rocm_triton_rotary_embedding_enabled(): + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_with_key_forward_triton", + op_func=rocm_aiter_rotary_emb_with_key_forward_triton_impl, + mutates_args=["key", "query"], + fake_impl=rocm_aiter_rotary_emb_with_key_forward_triton_fake, + dispatch_key=current_platform.dispatch_key, + ) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 2897f75b3129..ec51966a6c8b 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility methods for model layers.""" +import os from typing import Callable, Optional import torch @@ -10,6 +11,12 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +if current_platform.is_rocm(): + from aiter.ops.triton.gemm_a16w16 import gemm_a16w16 + +VLLM_USE_AITER_TRITON_GEMM = (os.getenv("VLLM_USE_AITER_TRITON_GEMM", + "False").lower() in ("true", "1")) + def shuffle_weight(w: torch.Tensor) -> torch.Tensor: # Shuffle weight along the last dimension so that @@ -92,22 +99,37 @@ def default_unquantized_gemm(layer: torch.nn.Module, return torch.nn.functional.linear(x, weight, bias) +def aiter_GEMM_check(m, n, k): + return ((n == 5120 and k == 2880) or (n == 2880 and k == 4096) + or (n == 128 and k == 2880) or (n == 640 and k == 2880) + or (n == 2880 and k == 512)) + + def rocm_unquantized_gemm_impl( x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: from vllm.platforms.rocm import on_gfx9 k = weight.shape[1] + m = weight.shape[0] + if x.is_contiguous(): + x_view = x.view(-1, x.size(-1)) + else: + x_view = x.reshape(-1, x.size(-1)) + n = x_view.shape[0] use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \ x.dtype in [torch.float16, torch.bfloat16] \ and k % 8 == 0 and bias is None) + if VLLM_USE_AITER_TRITON_GEMM and aiter_GEMM_check(n, m, k): + return gemm_a16w16(x, weight, bias) + if use_skinny is not True: return torch.nn.functional.linear(x, weight, bias) - x_view = x.view(-1, x.size(-1)) - n = x_view.shape[0] - m = weight.shape[0] + # x_view = x.view(-1, x.size(-1)) + # n = x_view.shape[0] + # m = weight.shape[0] cu_count = current_platform.get_cu_count() if m > 8 and 0 < n <= 4: diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 9c1c05320cf3..99962400bb13 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os from collections.abc import Iterable from typing import Optional @@ -24,11 +25,28 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import cdiv - +from vllm.platforms import current_platform from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, maybe_prefix) +import os + +if current_platform.is_rocm(): + from 
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 9c1c05320cf3..99962400bb13 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 from collections.abc import Iterable
 from typing import Optional
 
@@ -24,11 +25,28 @@
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils import cdiv
-
+from vllm.platforms import current_platform
 from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index,
                     maybe_prefix)
+import os
+
+if current_platform.is_rocm():
+    from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+
+VLLM_USE_AITER_TRITON_FUSED_SPLIT_QKV_ROPE = (os.getenv(
+    "VLLM_USE_AITER_TRITON_FUSED_SPLIT_QKV_ROPE", "False").lower()
+                                              in ("true", "1"))
+VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD = (os.getenv(
+    "VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD", "False").lower()
+                                               in ("true", "1"))
+if VLLM_USE_AITER_TRITON_FUSED_SPLIT_QKV_ROPE:
+    from aiter.ops.triton.fused_qkv_split_qk_rope import (
+        fused_qkv_split_qk_rope)
+if VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+    from aiter.ops.triton.fused_add_rmsnorm_pad import fused_add_rmsnorm_pad
 
 
 class OAIAttention(nn.Module):
@@ -119,15 +137,45 @@ def __init__(
 
     def forward(self, hidden_states: torch.Tensor,
                 positions: torch.Tensor) -> torch.Tensor:
-        t = self.norm(hidden_states)
-
+        # t = self.norm(hidden_states)
+        if isinstance(hidden_states, tuple) and current_platform.is_rocm(
+        ) and VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+            hidden_states, res = hidden_states
+            t, hidden_states = fused_add_rmsnorm_pad(
+                hidden_states, self.norm.weight, self.norm.variance_epsilon,
+                res)
+        else:
+            t = self.norm(hidden_states)
         qkv, _ = self.qkv(t)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
+        # q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # q, k = self.rotary_emb(positions, q, k)
+        if VLLM_USE_AITER_TRITON_FUSED_SPLIT_QKV_ROPE:
+            cos, sin = self.rotary_emb.cos_sin_cache.chunk(2, dim=-1)
+            q, k, v = fused_qkv_split_qk_rope(
+                qkv,
+                cos,
+                sin,
+                positions,
+                self.num_local_attention_heads,
+                self.num_local_key_value_heads,
+                self.head_dim,
+                is_neox=self.rotary_emb.is_neox_style,
+                offsets=None,
+                reuse_freqs_front_part=(self.head_dim // 2 == cos.shape[-1]),
+                nope_first=False,
+            )
+            q = q.view(-1, self.q_size)
+            k = k.view(-1, self.kv_size)
+        else:
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
+                                dim=-1)
+            q, k = self.rotary_emb(positions, q, k)
         v = v.contiguous()
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
-
+        if current_platform.is_rocm(
+        ) and VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+            return output, hidden_states
         return output + hidden_states
 
 
@@ -145,6 +193,7 @@ def __init__(
         self.num_experts = config.num_local_experts
         self.experts_per_token = config.num_experts_per_tok
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        self.hidden_size = config.hidden_size
         self.norm = RMSNorm(config.hidden_size, eps=1e-5)
         self.router = torch.nn.Linear(config.hidden_size,
                                       config.num_local_experts,
@@ -162,10 +211,29 @@ def __init__(
                                  has_bias=True,
                                  activation="swigluoai")
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        t = self.norm(x)
-        g = self.router(t)
-        t = self.experts(hidden_states=t, router_logits=g)
+    def forward(self, x: torch.Tensor | tuple) -> torch.Tensor:
+        if isinstance(x, tuple) and current_platform.is_rocm(
+        ) and VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+            x, res = x
+            t, x = fused_add_rmsnorm_pad(x,
+                                         self.norm.weight,
+                                         self.norm.variance_epsilon,
+                                         res,
+                                         x_pad_to_multiple=256)
+        else:
+            t = self.norm(x)
+
+        if current_platform.is_rocm():
+            g = gemm_a16w16(t[:, :self.hidden_size], self.router.weight,
+                            self.router.bias)
+        else:
+            g = self.router(t[:, :self.hidden_size])
+        t = self.experts(hidden_states=t,
+                         router_logits=g)[:, :self.hidden_size]
+
+        if current_platform.is_rocm(
+        ) and VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+            return x, t
         return x + t
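When `VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD` is set, the attention and MLP blocks above hand a `(hidden_states, residual)` tuple to the next block instead of materialising `output + hidden_states`. Below is a pure-PyTorch mock of what `fused_add_rmsnorm_pad` is assumed to compute (residual add, RMSNorm, optional zero-padding of the hidden dimension), written only to illustrate that tuple flow and the later `[:, :self.hidden_size]` slices; it is not the AITER kernel.

```python
# Hedged, pure-PyTorch mock of the assumed fused add + RMSNorm + pad semantics.
import torch


def mock_fused_add_rmsnorm_pad(x, weight, eps, residual, x_pad_to_multiple=None):
    residual = x + residual                          # fused residual add
    variance = residual.float().pow(2).mean(-1, keepdim=True)
    normed = (residual.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    if x_pad_to_multiple:                            # zero-pad hidden dim, e.g. to 256
        pad = (-normed.shape[-1]) % x_pad_to_multiple
        normed = torch.nn.functional.pad(normed, (0, pad))
    return normed, residual                          # (input to next matmul, new residual)


h, res = torch.randn(4, 2880), torch.randn(4, 2880)
t, res = mock_fused_add_rmsnorm_pad(h, torch.ones(2880), 1e-5, res,
                                    x_pad_to_multiple=256)
print(t.shape, res.shape)   # torch.Size([4, 3072]) torch.Size([4, 2880])
```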
@@ -229,7 +297,13 @@ def forward(self, input_ids: torch.Tensor,
         x = self.embedding(input_ids)
         for layer in self.layers:
             x = layer(x, positions)
-        x = self.norm(x)
+        if isinstance(x, tuple) and current_platform.is_rocm(
+        ) and VLLM_USE_AITER_TRITON_FUSED_ADD_RMSNORM_PAD:
+            x, res = x
+            x, _ = fused_add_rmsnorm_pad(x, self.norm.weight,
+                                         self.norm.variance_epsilon, res)
+        else:
+            x = self.norm(x)
         return x
 
     def _load_weights_mxfp4(
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e39a6df843cd..2a9cb5af044c 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -30,6 +30,7 @@
 from torch import nn
 from transformers import LlamaConfig
 
+import vllm.envs as envs
 from vllm.attention import Attention, AttentionType
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
@@ -56,6 +57,9 @@
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
+from vllm.platforms import current_platform
+if current_platform.is_rocm():
+    VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE = envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
 
 class LlamaMLP(nn.Module):
@@ -92,8 +96,12 @@ def __init__(
 
     def forward(self, x):
         x, _ = self.gate_up_proj(x)
-        x = self.act_fn(x)
-        x, _ = self.down_proj(x)
+        if envs.VLLM_USE_AITER_TRITON_SILU_MUL:
+            x, x_scales = self.act_fn(x)
+            x, _ = self.down_proj(x, x_scales)
+        else:
+            x = self.act_fn(x)
+            x, _ = self.down_proj(x)
         return x
 
@@ -196,8 +204,14 @@ def forward(
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v)
+
+        if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE:
+            attn_output = self.attn(q, k, v,
+                positions=positions, cos_sin_cache=self.rotary_emb.cos_sin_cache, is_neox=self.rotary_emb.is_neox_style)
+        else:
+            q, k = self.rotary_emb(positions, q, k)
+            attn_output = self.attn(q, k, v)
+
         output, _ = self.o_proj(attn_output)
         return output
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 6129477439d5..1e232de10e75 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -112,6 +112,12 @@ def on_mi3xx() -> bool:
     return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"])
 
 
+@cache
+def not_mi350() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return "gfx950" not in GPU_ARCH
+
+
 @cache
 def on_gfx9() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
@@ -283,7 +289,9 @@ def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
                         link_type = amdsmi_topo_get_link_type(
                             handle, peer_handle)
                         # type is 2 for XGMI
-                        if link_type["hops"] != 1 or link_type["type"] != 2:
+                        if link_type["hops"] != 1 or link_type["type"] not in [
+                                1, 2
+                        ]:
                             return False
                     except AmdSmiException as error:
                         logger.error("AMD 1 hop XGMI detection failed.",
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index b4b057fa281f..4b0b0e20e3a0 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -325,6 +325,22 @@ def build(self,
             dtype=torch.uint8,
             device=self.device,
         )
+        if max_query_len > 1:
+            # We pre-compute cumulative seq len needed for prefill attention
+            # here to avoid recomputing it for every layer
+            cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
+                                      dtype=torch.int32,
+                                      device=seq_lens.device)
+            torch.cumsum(seq_lens,
+                         dim=0,
+                         dtype=cu_seq_lens.dtype,
+                         out=cu_seq_lens[1:])
+            num_actual_kv_tokens = int(cu_seq_lens[-1].item())
+        else:
+            cu_seq_lens = None
+            num_actual_kv_tokens = 0
+
+
         use_cascade = common_prefix_len > 0
 
         attn_metadata = AiterFlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
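The hunk above computes the cumulative sequence lengths once per batch in the metadata builder so that every layer can reuse them. A self-contained sketch of the same `torch.cumsum`-into-a-preallocated-buffer pattern:

```python
# cu_seq_lens[0] stays 0 and cu_seq_lens[1:] receives the running totals,
# so the prefix sums are computed once per batch rather than once per layer.
import torch

seq_lens = torch.tensor([5, 3, 7], dtype=torch.int32)
cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, dtype=torch.int32)
torch.cumsum(seq_lens, dim=0, dtype=cu_seq_lens.dtype, out=cu_seq_lens[1:])
num_actual_kv_tokens = int(cu_seq_lens[-1].item())
print(cu_seq_lens.tolist(), num_actual_kv_tokens)  # [0, 5, 8, 15] 15
```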
@@ -446,6 +462,7 @@ def __init__(
         self.sinks = sinks
         if self.sinks is not None:
             raise NotImplementedError("Sinks are not supported for ROCM AITER")
+        self.sinks = sinks
 
     def forward(
         self,
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 25021b9e23d9..db985591cd0e 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -27,6 +27,12 @@
 
 logger = init_logger(__name__)
 
+if current_platform.is_rocm():
+    VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE = envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
+    if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE:
+        from aiter.ops.triton.fused_kv_cache import (
+            fused_qk_rope_reshape_and_cache)
+
 
 @dataclass
 class TritonAttentionMetadata:
@@ -246,24 +252,24 @@ def __init__(
                 "TritonAttentionImpl")
 
         self.fp8_dtype = current_platform.fp8_dtype()
-        self.force_prefill_decode_attn = \
-            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
-
-        if not self.force_prefill_decode_attn:
-            # If not using prefill decode attention, we use the Triton
-            # unified attention implementation.
-            if use_aiter_unified_attention():
-                logger.info_once(
-                    "Using aiter unified attention for TritonAttentionImpl")
-                from aiter.ops.triton.unified_attention import (
-                    unified_attention)
-                self.unified_attention = unified_attention
-            else:
-                logger.info_once(
-                    "Using vllm unified attention for TritonAttentionImpl")
-                from vllm.attention.ops.triton_unified_attention import (
-                    unified_attention)
-                self.unified_attention = unified_attention
+
+        # If not using prefill decode attention, we use the Triton
+        # unified attention implementation.
+        if use_aiter_unified_attention():
+            logger.info_once(
+                "Using aiter unified attention for TritonAttentionImpl")
+            from aiter.ops.triton.unified_attention import unified_attention
+            self.unified_attention = unified_attention
+        elif not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            logger.info_once(
+                "Using vllm unified attention for TritonAttentionImpl")
+            from vllm.attention.ops.triton_unified_attention import (
+                unified_attention)
+            self.unified_attention = unified_attention
+        else:
+            logger.info_once(
+                "Using vllm split prefill decode attention for TritonAttentionImpl"
+            )
 
         self.sinks = sinks
         if sinks is not None:
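The rewritten constructor above, together with the `use_prefill_decode_attn` expression in the forward hunk below, changes the selection order: AITER unified attention now wins even when `VLLM_V1_USE_PREFILL_DECODE_ATTENTION` is set, and the split prefill/decode path is only used when neither unified path applies. A small sketch of the resulting decision table; the flag names come from the hunk and the helper is treated as a plain boolean input.

```python
# Hedged sketch of the attention-path precedence implied by the new __init__.
def select_attention_path(use_aiter_unified: bool,
                          use_prefill_decode_env: bool) -> str:
    if use_aiter_unified:
        return "aiter unified attention"
    if not use_prefill_decode_env:
        return "vllm unified attention"
    return "vllm split prefill/decode attention"


for aiter_flag in (False, True):
    for split_flag in (False, True):
        print(aiter_flag, split_flag, "->",
              select_attention_path(aiter_flag, split_flag))
```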
@@ -282,6 +288,9 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         output: Optional[torch.Tensor] = None,
         output_scale: Optional[torch.Tensor] = None,
+        positions: torch.Tensor = None,
+        cos_sin_cache: torch.Tensor = None,
+        is_neox: bool = False,
         output_block_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -318,7 +327,8 @@ def forward(
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
-        use_prefill_decode_attn = self.force_prefill_decode_attn
+        use_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION \
+            and not use_aiter_unified_attention()
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
@@ -327,37 +337,70 @@ def forward(
         else:
             key_cache, value_cache = kv_cache.unbind(0)
 
-        if self.kv_sharing_target_layer_name is None:
-            # Reshape the input keys and values and store them in the cache.
-            # Skip this if sharing KV cache with an earlier attention layer.
-            if use_prefill_decode_attn:
-                PagedAttention.write_to_paged_cache(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
-            else:
-                torch.ops._C_cache_ops.reshape_and_cache_flash(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
+        if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE:
+            assert self.kv_sharing_target_layer_name is None, "self.kv_sharing_target_layer_name error"
+            cos, sin = cos_sin_cache.chunk(2, dim=-1)
+            is_fp8_kv_cache = self.kv_cache_dtype.startswith("fp8")
+            if is_fp8_kv_cache:
+                key_cache_og_dtype = key_cache.dtype
+                value_cache_og_dtype = value_cache.dtype
+                key_cache = key_cache.view(self.fp8_dtype)
+                value_cache = value_cache.view(self.fp8_dtype)
+            query, key, key_cache, value_cache, output = fused_qk_rope_reshape_and_cache(
+                query,
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                positions,
+                cos,
+                sin,
+                layer._k_scale,
+                layer._v_scale,
+                is_neox,
+                flash_layout=(not use_prefill_decode_attn),
+                apply_scale=is_fp8_kv_cache,
+                offs=None,
+                q_out=query,
+                k_out=key,
+                output_zeros=True,
+                zeros_out=output)
+            if is_fp8_kv_cache:
+                key_cache = key_cache.view(key_cache_og_dtype)
+                value_cache = value_cache.view(value_cache_og_dtype)
+        else:
+            if self.kv_sharing_target_layer_name is None:
+                # Reshape the input keys and values and store them in the cache.
+                # Skip this if sharing KV cache with an earlier attention layer.
+                if use_prefill_decode_attn:
+                    PagedAttention.write_to_paged_cache(
+                        key,
+                        value,
+                        key_cache,
+                        value_cache,
+                        attn_metadata.slot_mapping,
+                        self.kv_cache_dtype,
+                        layer._k_scale,
+                        layer._v_scale,
+                    )
+                else:
+                    torch.ops._C_cache_ops.reshape_and_cache_flash(
+                        key,
+                        value,
+                        key_cache,
+                        value_cache,
+                        attn_metadata.slot_mapping,
+                        self.kv_cache_dtype,
+                        layer._k_scale,
+                        layer._v_scale,
+                    )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
             value_cache = value_cache.view(self.fp8_dtype)
 
         num_tokens, num_heads, head_size = query.shape
-        assert layer._q_scale == 1.0, \
+        assert layer._q_scale_float == 1.0, \
             "A non 1.0 q_scale is not currently supported."
         if not current_platform.is_rocm():
             # Skip Q quantization on ROCm, since dequantizing back to